import os
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score,recall_score,accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression , LogisticRegressionCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score
%matplotlib inline
np.random.seed(0)
os.getcwd()
'C:\\Users\\Varsha\\Applications of software'
os.chdir('C:\AOS')
os.getcwd()
'C:\\AOS'
churn=pd.read_csv('churn.csv')
churn.head()
| mobile_number | circle_id | loc_og_t2o_mou | std_og_t2o_mou | loc_ic_t2o_mou | last_date_of_month_6 | last_date_of_month_7 | last_date_of_month_8 | last_date_of_month_9 | arpu_6 | ... | sachet_3g_9 | fb_user_6 | fb_user_7 | fb_user_8 | fb_user_9 | aon | aug_vbc_3g | jul_vbc_3g | jun_vbc_3g | sep_vbc_3g | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7000842753 | 109 | 0.0 | 0.0 | 0.0 | 30/06/2014 0:00 | 31/07/2014 0:00 | 31/08/2014 0:00 | 30/09/2014 0:00 | 197.385 | ... | 0 | 1.0 | 1.0 | 1.0 | NaN | 968 | 30.4 | 0.0 | 101.20 | 3.58 |
| 1 | 7001865778 | 109 | 0.0 | 0.0 | 0.0 | 30/06/2014 0:00 | 31/07/2014 0:00 | 31/08/2014 0:00 | 30/09/2014 0:00 | 34.047 | ... | 0 | NaN | 1.0 | 1.0 | NaN | 1006 | 0.0 | 0.0 | 0.00 | 0.00 |
| 2 | 7001625959 | 109 | 0.0 | 0.0 | 0.0 | 30/06/2014 0:00 | 31/07/2014 0:00 | 31/08/2014 0:00 | 30/09/2014 0:00 | 167.690 | ... | 0 | NaN | NaN | NaN | 1.0 | 1103 | 0.0 | 0.0 | 4.17 | 0.00 |
| 3 | 7001204172 | 109 | 0.0 | 0.0 | 0.0 | 30/06/2014 0:00 | 31/07/2014 0:00 | 31/08/2014 0:00 | 30/09/2014 0:00 | 221.338 | ... | 0 | NaN | NaN | NaN | NaN | 2491 | 0.0 | 0.0 | 0.00 | 0.00 |
| 4 | 7000142493 | 109 | 0.0 | 0.0 | 0.0 | 30/06/2014 0:00 | 31/07/2014 0:00 | 31/08/2014 0:00 | 30/09/2014 0:00 | 261.636 | ... | 0 | 0.0 | NaN | NaN | NaN | 1526 | 0.0 | 0.0 | 0.00 | 0.00 |
5 rows × 226 columns
print(churn.shape)
print(churn.info())
(99999, 226) <class 'pandas.core.frame.DataFrame'> RangeIndex: 99999 entries, 0 to 99998 Columns: 226 entries, mobile_number to sep_vbc_3g dtypes: float64(179), int64(35), object(12) memory usage: 172.4+ MB None
churn.columns.values
array(['mobile_number', 'circle_id', 'loc_og_t2o_mou', 'std_og_t2o_mou',
'loc_ic_t2o_mou', 'last_date_of_month_6', 'last_date_of_month_7',
'last_date_of_month_8', 'last_date_of_month_9', 'arpu_6', 'arpu_7',
'arpu_8', 'arpu_9', 'onnet_mou_6', 'onnet_mou_7', 'onnet_mou_8',
'onnet_mou_9', 'offnet_mou_6', 'offnet_mou_7', 'offnet_mou_8',
'offnet_mou_9', 'roam_ic_mou_6', 'roam_ic_mou_7', 'roam_ic_mou_8',
'roam_ic_mou_9', 'roam_og_mou_6', 'roam_og_mou_7', 'roam_og_mou_8',
'roam_og_mou_9', 'loc_og_t2t_mou_6', 'loc_og_t2t_mou_7',
'loc_og_t2t_mou_8', 'loc_og_t2t_mou_9', 'loc_og_t2m_mou_6',
'loc_og_t2m_mou_7', 'loc_og_t2m_mou_8', 'loc_og_t2m_mou_9',
'loc_og_t2f_mou_6', 'loc_og_t2f_mou_7', 'loc_og_t2f_mou_8',
'loc_og_t2f_mou_9', 'loc_og_t2c_mou_6', 'loc_og_t2c_mou_7',
'loc_og_t2c_mou_8', 'loc_og_t2c_mou_9', 'loc_og_mou_6',
'loc_og_mou_7', 'loc_og_mou_8', 'loc_og_mou_9', 'std_og_t2t_mou_6',
'std_og_t2t_mou_7', 'std_og_t2t_mou_8', 'std_og_t2t_mou_9',
'std_og_t2m_mou_6', 'std_og_t2m_mou_7', 'std_og_t2m_mou_8',
'std_og_t2m_mou_9', 'std_og_t2f_mou_6', 'std_og_t2f_mou_7',
'std_og_t2f_mou_8', 'std_og_t2f_mou_9', 'std_og_t2c_mou_6',
'std_og_t2c_mou_7', 'std_og_t2c_mou_8', 'std_og_t2c_mou_9',
'std_og_mou_6', 'std_og_mou_7', 'std_og_mou_8', 'std_og_mou_9',
'isd_og_mou_6', 'isd_og_mou_7', 'isd_og_mou_8', 'isd_og_mou_9',
'spl_og_mou_6', 'spl_og_mou_7', 'spl_og_mou_8', 'spl_og_mou_9',
'og_others_6', 'og_others_7', 'og_others_8', 'og_others_9',
'total_og_mou_6', 'total_og_mou_7', 'total_og_mou_8',
'total_og_mou_9', 'loc_ic_t2t_mou_6', 'loc_ic_t2t_mou_7',
'loc_ic_t2t_mou_8', 'loc_ic_t2t_mou_9', 'loc_ic_t2m_mou_6',
'loc_ic_t2m_mou_7', 'loc_ic_t2m_mou_8', 'loc_ic_t2m_mou_9',
'loc_ic_t2f_mou_6', 'loc_ic_t2f_mou_7', 'loc_ic_t2f_mou_8',
'loc_ic_t2f_mou_9', 'loc_ic_mou_6', 'loc_ic_mou_7', 'loc_ic_mou_8',
'loc_ic_mou_9', 'std_ic_t2t_mou_6', 'std_ic_t2t_mou_7',
'std_ic_t2t_mou_8', 'std_ic_t2t_mou_9', 'std_ic_t2m_mou_6',
'std_ic_t2m_mou_7', 'std_ic_t2m_mou_8', 'std_ic_t2m_mou_9',
'std_ic_t2f_mou_6', 'std_ic_t2f_mou_7', 'std_ic_t2f_mou_8',
'std_ic_t2f_mou_9', 'std_ic_t2o_mou_6', 'std_ic_t2o_mou_7',
'std_ic_t2o_mou_8', 'std_ic_t2o_mou_9', 'std_ic_mou_6',
'std_ic_mou_7', 'std_ic_mou_8', 'std_ic_mou_9', 'total_ic_mou_6',
'total_ic_mou_7', 'total_ic_mou_8', 'total_ic_mou_9',
'spl_ic_mou_6', 'spl_ic_mou_7', 'spl_ic_mou_8', 'spl_ic_mou_9',
'isd_ic_mou_6', 'isd_ic_mou_7', 'isd_ic_mou_8', 'isd_ic_mou_9',
'ic_others_6', 'ic_others_7', 'ic_others_8', 'ic_others_9',
'total_rech_num_6', 'total_rech_num_7', 'total_rech_num_8',
'total_rech_num_9', 'total_rech_amt_6', 'total_rech_amt_7',
'total_rech_amt_8', 'total_rech_amt_9', 'max_rech_amt_6',
'max_rech_amt_7', 'max_rech_amt_8', 'max_rech_amt_9',
'date_of_last_rech_6', 'date_of_last_rech_7',
'date_of_last_rech_8', 'date_of_last_rech_9', 'last_day_rch_amt_6',
'last_day_rch_amt_7', 'last_day_rch_amt_8', 'last_day_rch_amt_9',
'date_of_last_rech_data_6', 'date_of_last_rech_data_7',
'date_of_last_rech_data_8', 'date_of_last_rech_data_9',
'total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8',
'total_rech_data_9', 'max_rech_data_6', 'max_rech_data_7',
'max_rech_data_8', 'max_rech_data_9', 'count_rech_2g_6',
'count_rech_2g_7', 'count_rech_2g_8', 'count_rech_2g_9',
'count_rech_3g_6', 'count_rech_3g_7', 'count_rech_3g_8',
'count_rech_3g_9', 'av_rech_amt_data_6', 'av_rech_amt_data_7',
'av_rech_amt_data_8', 'av_rech_amt_data_9', 'vol_2g_mb_6',
'vol_2g_mb_7', 'vol_2g_mb_8', 'vol_2g_mb_9', 'vol_3g_mb_6',
'vol_3g_mb_7', 'vol_3g_mb_8', 'vol_3g_mb_9', 'arpu_3g_6',
'arpu_3g_7', 'arpu_3g_8', 'arpu_3g_9', 'arpu_2g_6', 'arpu_2g_7',
'arpu_2g_8', 'arpu_2g_9', 'night_pck_user_6', 'night_pck_user_7',
'night_pck_user_8', 'night_pck_user_9', 'monthly_2g_6',
'monthly_2g_7', 'monthly_2g_8', 'monthly_2g_9', 'sachet_2g_6',
'sachet_2g_7', 'sachet_2g_8', 'sachet_2g_9', 'monthly_3g_6',
'monthly_3g_7', 'monthly_3g_8', 'monthly_3g_9', 'sachet_3g_6',
'sachet_3g_7', 'sachet_3g_8', 'sachet_3g_9', 'fb_user_6',
'fb_user_7', 'fb_user_8', 'fb_user_9', 'aon', 'aug_vbc_3g',
'jul_vbc_3g', 'jun_vbc_3g', 'sep_vbc_3g'], dtype=object)
#grouping columns based on datatypes
churn.columns.to_series().groupby(churn.dtypes).groups
{int64: ['mobile_number', 'circle_id', 'total_rech_num_6', 'total_rech_num_7', 'total_rech_num_8', 'total_rech_num_9', 'total_rech_amt_6', 'total_rech_amt_7', 'total_rech_amt_8', 'total_rech_amt_9', 'max_rech_amt_6', 'max_rech_amt_7', 'max_rech_amt_8', 'max_rech_amt_9', 'last_day_rch_amt_6', 'last_day_rch_amt_7', 'last_day_rch_amt_8', 'last_day_rch_amt_9', 'monthly_2g_6', 'monthly_2g_7', 'monthly_2g_8', 'monthly_2g_9', 'sachet_2g_6', 'sachet_2g_7', 'sachet_2g_8', 'sachet_2g_9', 'monthly_3g_6', 'monthly_3g_7', 'monthly_3g_8', 'monthly_3g_9', 'sachet_3g_6', 'sachet_3g_7', 'sachet_3g_8', 'sachet_3g_9', 'aon'], float64: ['loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou', 'arpu_6', 'arpu_7', 'arpu_8', 'arpu_9', 'onnet_mou_6', 'onnet_mou_7', 'onnet_mou_8', 'onnet_mou_9', 'offnet_mou_6', 'offnet_mou_7', 'offnet_mou_8', 'offnet_mou_9', 'roam_ic_mou_6', 'roam_ic_mou_7', 'roam_ic_mou_8', 'roam_ic_mou_9', 'roam_og_mou_6', 'roam_og_mou_7', 'roam_og_mou_8', 'roam_og_mou_9', 'loc_og_t2t_mou_6', 'loc_og_t2t_mou_7', 'loc_og_t2t_mou_8', 'loc_og_t2t_mou_9', 'loc_og_t2m_mou_6', 'loc_og_t2m_mou_7', 'loc_og_t2m_mou_8', 'loc_og_t2m_mou_9', 'loc_og_t2f_mou_6', 'loc_og_t2f_mou_7', 'loc_og_t2f_mou_8', 'loc_og_t2f_mou_9', 'loc_og_t2c_mou_6', 'loc_og_t2c_mou_7', 'loc_og_t2c_mou_8', 'loc_og_t2c_mou_9', 'loc_og_mou_6', 'loc_og_mou_7', 'loc_og_mou_8', 'loc_og_mou_9', 'std_og_t2t_mou_6', 'std_og_t2t_mou_7', 'std_og_t2t_mou_8', 'std_og_t2t_mou_9', 'std_og_t2m_mou_6', 'std_og_t2m_mou_7', 'std_og_t2m_mou_8', 'std_og_t2m_mou_9', 'std_og_t2f_mou_6', 'std_og_t2f_mou_7', 'std_og_t2f_mou_8', 'std_og_t2f_mou_9', 'std_og_t2c_mou_6', 'std_og_t2c_mou_7', 'std_og_t2c_mou_8', 'std_og_t2c_mou_9', 'std_og_mou_6', 'std_og_mou_7', 'std_og_mou_8', 'std_og_mou_9', 'isd_og_mou_6', 'isd_og_mou_7', 'isd_og_mou_8', 'isd_og_mou_9', 'spl_og_mou_6', 'spl_og_mou_7', 'spl_og_mou_8', 'spl_og_mou_9', 'og_others_6', 'og_others_7', 'og_others_8', 'og_others_9', 'total_og_mou_6', 'total_og_mou_7', 'total_og_mou_8', 'total_og_mou_9', 'loc_ic_t2t_mou_6', 'loc_ic_t2t_mou_7', 'loc_ic_t2t_mou_8', 'loc_ic_t2t_mou_9', 'loc_ic_t2m_mou_6', 'loc_ic_t2m_mou_7', 'loc_ic_t2m_mou_8', 'loc_ic_t2m_mou_9', 'loc_ic_t2f_mou_6', 'loc_ic_t2f_mou_7', 'loc_ic_t2f_mou_8', 'loc_ic_t2f_mou_9', 'loc_ic_mou_6', 'loc_ic_mou_7', 'loc_ic_mou_8', 'loc_ic_mou_9', 'std_ic_t2t_mou_6', 'std_ic_t2t_mou_7', 'std_ic_t2t_mou_8', 'std_ic_t2t_mou_9', 'std_ic_t2m_mou_6', ...], object: ['last_date_of_month_6', 'last_date_of_month_7', 'last_date_of_month_8', 'last_date_of_month_9', 'date_of_last_rech_6', 'date_of_last_rech_7', 'date_of_last_rech_8', 'date_of_last_rech_9', 'date_of_last_rech_data_6', 'date_of_last_rech_data_7', 'date_of_last_rech_data_8', 'date_of_last_rech_data_9']}
#Converting date columns from object to datetime
date_columns=churn[['last_date_of_month_6', 'last_date_of_month_7', 'last_date_of_month_8',
'last_date_of_month_9', 'date_of_last_rech_6', 'date_of_last_rech_7',
'date_of_last_rech_8', 'date_of_last_rech_9',
'date_of_last_rech_data_6', 'date_of_last_rech_data_7',
'date_of_last_rech_data_8', 'date_of_last_rech_data_9']]
date_columns.head()
| last_date_of_month_6 | last_date_of_month_7 | last_date_of_month_8 | last_date_of_month_9 | date_of_last_rech_6 | date_of_last_rech_7 | date_of_last_rech_8 | date_of_last_rech_9 | date_of_last_rech_data_6 | date_of_last_rech_data_7 | date_of_last_rech_data_8 | date_of_last_rech_data_9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 30/06/2014 0:00 | 31/07/2014 0:00 | 31/08/2014 0:00 | 30/09/2014 0:00 | 21/06/2014 0:00 | 16/07/2014 0:00 | 08/08/2014 0:00 | 28/09/2014 0:00 | 21/06/2014 0:00 | 16/07/2014 0:00 | 08/08/2014 0:00 | NaN |
| 1 | 30/06/2014 0:00 | 31/07/2014 0:00 | 31/08/2014 0:00 | 30/09/2014 0:00 | 29/06/2014 0:00 | 31/07/2014 0:00 | 28/08/2014 0:00 | 30/09/2014 0:00 | NaN | 25/07/2014 0:00 | 10/08/2014 0:00 | NaN |
| 2 | 30/06/2014 0:00 | 31/07/2014 0:00 | 31/08/2014 0:00 | 30/09/2014 0:00 | 17/06/2014 0:00 | 24/07/2014 0:00 | 14/08/2014 0:00 | 29/09/2014 0:00 | NaN | NaN | NaN | 17/09/2014 0:00 |
| 3 | 30/06/2014 0:00 | 31/07/2014 0:00 | 31/08/2014 0:00 | 30/09/2014 0:00 | 28/06/2014 0:00 | 31/07/2014 0:00 | 31/08/2014 0:00 | 30/09/2014 0:00 | NaN | NaN | NaN | NaN |
| 4 | 30/06/2014 0:00 | 31/07/2014 0:00 | 31/08/2014 0:00 | 30/09/2014 0:00 | 26/06/2014 0:00 | 28/07/2014 0:00 | 09/08/2014 0:00 | 28/09/2014 0:00 | 04/06/2014 0:00 | NaN | NaN | NaN |
# unique values in date columns
print(date_columns.nunique())
#converting date columns to datetype format
date_columns=date_columns.astype('datetime64[ns]')
last_date_of_month_6 1 last_date_of_month_7 1 last_date_of_month_8 1 last_date_of_month_9 1 date_of_last_rech_6 30 date_of_last_rech_7 31 date_of_last_rech_8 31 date_of_last_rech_9 30 date_of_last_rech_data_6 30 date_of_last_rech_data_7 31 date_of_last_rech_data_8 31 date_of_last_rech_data_9 30 dtype: int64
print(date_columns.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 99999 entries, 0 to 99998 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 last_date_of_month_6 99999 non-null datetime64[ns] 1 last_date_of_month_7 99398 non-null datetime64[ns] 2 last_date_of_month_8 98899 non-null datetime64[ns] 3 last_date_of_month_9 98340 non-null datetime64[ns] 4 date_of_last_rech_6 98392 non-null datetime64[ns] 5 date_of_last_rech_7 98232 non-null datetime64[ns] 6 date_of_last_rech_8 96377 non-null datetime64[ns] 7 date_of_last_rech_9 95239 non-null datetime64[ns] 8 date_of_last_rech_data_6 25153 non-null datetime64[ns] 9 date_of_last_rech_data_7 25571 non-null datetime64[ns] 10 date_of_last_rech_data_8 26339 non-null datetime64[ns] 11 date_of_last_rech_data_9 25922 non-null datetime64[ns] dtypes: datetime64[ns](12) memory usage: 9.2 MB None
#Filtering Day from date of recharge columns
date_columns['day_of_last_rech_6'] = date_columns.date_of_last_rech_6.dt.day
date_columns['day_of_last_rech_7'] = date_columns.date_of_last_rech_7.dt.day
date_columns['day_of_last_rech_8'] = date_columns.date_of_last_rech_8.dt.day
#Imputing missing day fields with 0
date_columns.loc[:,'day_of_last_rech_6']=date_columns['day_of_last_rech_6'].fillna('0')
date_columns.loc[:,'day_of_last_rech_7']=date_columns['day_of_last_rech_7'].fillna('0')
date_columns.loc[:,'day_of_last_rech_8']=date_columns['day_of_last_rech_8'].fillna('0')
#Dropping last date of month columns as it contains only one value and variance is low
#Dropping Date of last recharge columns as day is extracted and is not useful to us anymore
#Dropping date of lsat recharge data fields as the missing values in > 70 %
#Dropping all columns with suffix _9 as they will not be used
date_columns=date_columns.drop(['date_of_last_rech_6','date_of_last_rech_7','date_of_last_rech_8',
'date_of_last_rech_9','last_date_of_month_6','last_date_of_month_7',
'last_date_of_month_8','last_date_of_month_9','date_of_last_rech_data_6',
'date_of_last_rech_data_7','date_of_last_rech_data_8','date_of_last_rech_data_9']
,axis=1)
print(round(100*(date_columns.isnull().sum()/len(date_columns.index)),2))
#Final date_Columns
date_columns.head()
day_of_last_rech_6 0.0 day_of_last_rech_7 0.0 day_of_last_rech_8 0.0 dtype: float64
| day_of_last_rech_6 | day_of_last_rech_7 | day_of_last_rech_8 | |
|---|---|---|---|
| 0 | 21.0 | 16.0 | 8.0 |
| 1 | 29.0 | 31.0 | 28.0 |
| 2 | 17.0 | 24.0 | 14.0 |
| 3 | 28.0 | 31.0 | 31.0 |
| 4 | 26.0 | 28.0 | 9.0 |
int_columns=churn[['mobile_number', 'circle_id', 'total_rech_num_6', 'total_rech_num_7',
'total_rech_num_8', 'total_rech_num_9', 'total_rech_amt_6',
'total_rech_amt_7', 'total_rech_amt_8', 'total_rech_amt_9',
'max_rech_amt_6', 'max_rech_amt_7', 'max_rech_amt_8', 'max_rech_amt_9',
'last_day_rch_amt_6', 'last_day_rch_amt_7', 'last_day_rch_amt_8',
'last_day_rch_amt_9', 'monthly_2g_6', 'monthly_2g_7', 'monthly_2g_8',
'monthly_2g_9', 'sachet_2g_6', 'sachet_2g_7', 'sachet_2g_8',
'sachet_2g_9', 'monthly_3g_6', 'monthly_3g_7', 'monthly_3g_8',
'monthly_3g_9', 'sachet_3g_6', 'sachet_3g_7', 'sachet_3g_8',
'sachet_3g_9', 'aon']]
int_columns.head()
| mobile_number | circle_id | total_rech_num_6 | total_rech_num_7 | total_rech_num_8 | total_rech_num_9 | total_rech_amt_6 | total_rech_amt_7 | total_rech_amt_8 | total_rech_amt_9 | ... | sachet_2g_9 | monthly_3g_6 | monthly_3g_7 | monthly_3g_8 | monthly_3g_9 | sachet_3g_6 | sachet_3g_7 | sachet_3g_8 | sachet_3g_9 | aon | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7000842753 | 109 | 4 | 3 | 2 | 6 | 362 | 252 | 252 | 0 | ... | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 968 |
| 1 | 7001865778 | 109 | 4 | 9 | 11 | 5 | 74 | 384 | 283 | 121 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1006 |
| 2 | 7001625959 | 109 | 5 | 4 | 2 | 7 | 168 | 315 | 116 | 358 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1103 |
| 3 | 7001204172 | 109 | 10 | 11 | 18 | 14 | 230 | 310 | 601 | 410 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2491 |
| 4 | 7000142493 | 109 | 5 | 6 | 3 | 4 | 196 | 350 | 287 | 200 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1526 |
5 rows × 35 columns
print(int_columns.nunique()) # Count of unique values in int columns
mobile_number 99999 circle_id 1 total_rech_num_6 102 total_rech_num_7 101 total_rech_num_8 96 total_rech_num_9 97 total_rech_amt_6 2305 total_rech_amt_7 2329 total_rech_amt_8 2347 total_rech_amt_9 2304 max_rech_amt_6 202 max_rech_amt_7 183 max_rech_amt_8 213 max_rech_amt_9 201 last_day_rch_amt_6 186 last_day_rch_amt_7 173 last_day_rch_amt_8 199 last_day_rch_amt_9 185 monthly_2g_6 5 monthly_2g_7 6 monthly_2g_8 6 monthly_2g_9 5 sachet_2g_6 32 sachet_2g_7 35 sachet_2g_8 34 sachet_2g_9 32 monthly_3g_6 12 monthly_3g_7 15 monthly_3g_8 12 monthly_3g_9 11 sachet_3g_6 25 sachet_3g_7 27 sachet_3g_8 29 sachet_3g_9 27 aon 3489 dtype: int64
int_columns[['monthly_2g_6', 'monthly_2g_7', 'monthly_2g_8',
'monthly_2g_9', 'sachet_2g_6', 'sachet_2g_7', 'sachet_2g_8',
'sachet_2g_9', 'monthly_3g_6', 'monthly_3g_7', 'monthly_3g_8',
'monthly_3g_9', 'sachet_3g_6', 'sachet_3g_7', 'sachet_3g_8',
'sachet_3g_9']].describe()
| monthly_2g_6 | monthly_2g_7 | monthly_2g_8 | monthly_2g_9 | sachet_2g_6 | sachet_2g_7 | sachet_2g_8 | sachet_2g_9 | monthly_3g_6 | monthly_3g_7 | monthly_3g_8 | monthly_3g_9 | sachet_3g_6 | sachet_3g_7 | sachet_3g_8 | sachet_3g_9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 99999.000000 | 99999.000000 | 99999.000000 | 99999.000000 | 99999.000000 | 99999.000000 | 99999.000000 | 99999.000000 | 99999.000000 | 99999.000000 | 99999.000000 | 99999.000000 | 99999.000000 | 99999.000000 | 99999.000000 | 99999.000000 |
| mean | 0.079641 | 0.083221 | 0.081001 | 0.068781 | 0.389384 | 0.439634 | 0.450075 | 0.393104 | 0.075921 | 0.078581 | 0.082941 | 0.086341 | 0.074781 | 0.080401 | 0.084501 | 0.084581 |
| std | 0.295058 | 0.304395 | 0.299568 | 0.278120 | 1.497320 | 1.636230 | 1.630263 | 1.347140 | 0.363371 | 0.387231 | 0.384947 | 0.384978 | 0.568344 | 0.628334 | 0.660234 | 0.650457 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| max | 4.000000 | 5.000000 | 5.000000 | 4.000000 | 42.000000 | 48.000000 | 44.000000 | 40.000000 | 14.000000 | 16.000000 | 16.000000 | 11.000000 | 29.000000 | 35.000000 | 41.000000 | 49.000000 |
# Conversion of the number of months by 12
int_columns.loc[:,'aon_years']=int_columns.aon/365
int_columns.loc[:,'aon_years'] = int_columns['aon_years'].astype(int)
int_columns.head()
| mobile_number | circle_id | total_rech_num_6 | total_rech_num_7 | total_rech_num_8 | total_rech_num_9 | total_rech_amt_6 | total_rech_amt_7 | total_rech_amt_8 | total_rech_amt_9 | ... | monthly_3g_6 | monthly_3g_7 | monthly_3g_8 | monthly_3g_9 | sachet_3g_6 | sachet_3g_7 | sachet_3g_8 | sachet_3g_9 | aon | aon_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7000842753 | 109 | 4 | 3 | 2 | 6 | 362 | 252 | 252 | 0 | ... | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 968 | 2.0 |
| 1 | 7001865778 | 109 | 4 | 9 | 11 | 5 | 74 | 384 | 283 | 121 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1006 | 2.0 |
| 2 | 7001625959 | 109 | 5 | 4 | 2 | 7 | 168 | 315 | 116 | 358 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1103 | 3.0 |
| 3 | 7001204172 | 109 | 10 | 11 | 18 | 14 | 230 | 310 | 601 | 410 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2491 | 6.0 |
| 4 | 7000142493 | 109 | 5 | 6 | 3 | 4 | 196 | 350 | 287 | 200 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1526 | 4.0 |
5 rows × 36 columns
#Deriving New columns for Good Phase(6 and 7 months)
#Sachet subscription
int_columns.loc[:,'sachet_2g_good_ph'] = (int_columns.sachet_2g_6 + int_columns.sachet_2g_7)/2
int_columns.loc[:,'sachet_3g_good_ph'] = (int_columns.sachet_3g_6 + int_columns.sachet_3g_7)/2
#Monthly 2G/3G
int_columns.loc[:,'monthly_2g_good_ph'] = (int_columns.monthly_2g_6 + int_columns.monthly_2g_7)/2
int_columns.loc[:,'monthly_3g_good_ph'] = (int_columns.monthly_3g_6 + int_columns.monthly_3g_7)/2
#max_recaharge amount
int_columns.loc[:,'max_rech_amt_good_ph'] = (int_columns.max_rech_amt_6 + int_columns.max_rech_amt_7)/2
int_columns.loc[:,'total_rech_num_good_ph'] = (int_columns.total_rech_num_6 + int_columns.total_rech_num_7)/2
# Average total recharge for first 2 months
int_columns.loc[:,'tot_rech_amt_good_ph'] = (int_columns.total_rech_amt_6 + int_columns.total_rech_amt_7)/2
#df['avg_tot_rech_3mths'] = (df['total_rech_amt_6']+df['total_rech_amt_7']+df['total_rech_amt_8'])/3
int_columns.loc[:, 'last_day_rch_amt_good_ph'] = (int_columns.last_day_rch_amt_6 + int_columns.last_day_rch_amt_7)/2
#Dropping circle_id column as it contains only one value and variance is low
#Dropping aon column as new column is derived and is no longer useful
#Dropping all columns with suffix _6 and _7 as new columns derived
#Dropping all columns with suffix _9 as they will not be used
int_columns=int_columns.drop(['sachet_2g_6','sachet_2g_7','sachet_3g_6',
'sachet_3g_7','monthly_2g_6','monthly_2g_7',
'monthly_3g_6','monthly_3g_7','max_rech_amt_6',
'max_rech_amt_7','total_rech_num_6','total_rech_num_7','total_rech_amt_6','total_rech_amt_7',
'aon','circle_id','last_day_rch_amt_6','last_day_rch_amt_7','total_rech_num_9',
'total_rech_amt_9','max_rech_amt_9','last_day_rch_amt_9','monthly_2g_9','sachet_2g_9'
,'sachet_3g_9','monthly_3g_9']
,axis=1)
# final columns
int_columns.columns.values
array(['mobile_number', 'total_rech_num_8', 'total_rech_amt_8',
'max_rech_amt_8', 'last_day_rch_amt_8', 'monthly_2g_8',
'sachet_2g_8', 'monthly_3g_8', 'sachet_3g_8', 'aon_years',
'sachet_2g_good_ph', 'sachet_3g_good_ph', 'monthly_2g_good_ph',
'monthly_3g_good_ph', 'max_rech_amt_good_ph',
'total_rech_num_good_ph', 'tot_rech_amt_good_ph',
'last_day_rch_amt_good_ph'], dtype=object)
#groupingcolumnsbasedondatatypes
#df.columns[colforcoldf.dtypes=='float64']
list(churn.loc[:,churn.dtypes==float])
float_columns=churn[['loc_og_t2o_mou','std_og_t2o_mou','loc_ic_t2o_mou',
'arpu_6','arpu_7','arpu_8','arpu_9',
'onnet_mou_6','onnet_mou_7','onnet_mou_8','onnet_mou_9',
'offnet_mou_6','offnet_mou_7','offnet_mou_8','offnet_mou_9',
'roam_ic_mou_6','roam_ic_mou_7','roam_ic_mou_8','roam_ic_mou_9',
'roam_og_mou_6','roam_og_mou_7','roam_og_mou_8','roam_og_mou_9',
'loc_og_t2t_mou_6','loc_og_t2t_mou_7','loc_og_t2t_mou_8','loc_og_t2t_mou_9',
'loc_og_t2m_mou_6','loc_og_t2m_mou_7','loc_og_t2m_mou_8','loc_og_t2m_mou_9',
'loc_og_t2f_mou_6','loc_og_t2f_mou_7','loc_og_t2f_mou_8','loc_og_t2f_mou_9',
'loc_og_t2c_mou_6','loc_og_t2c_mou_7','loc_og_t2c_mou_8','loc_og_t2c_mou_9',
'loc_og_mou_6','loc_og_mou_7','loc_og_mou_8','loc_og_mou_9',
'std_og_t2t_mou_6','std_og_t2t_mou_7','std_og_t2t_mou_8','std_og_t2t_mou_9',
'std_og_t2m_mou_6','std_og_t2m_mou_7','std_og_t2m_mou_8','std_og_t2m_mou_9',
'std_og_t2f_mou_6','std_og_t2f_mou_7','std_og_t2f_mou_8','std_og_t2f_mou_9',
'std_og_t2c_mou_6','std_og_t2c_mou_7','std_og_t2c_mou_8','std_og_t2c_mou_9',
'std_og_mou_6','std_og_mou_7','std_og_mou_8','std_og_mou_9','isd_og_mou_6',
'isd_og_mou_7','isd_og_mou_8','isd_og_mou_9',
'spl_og_mou_6','spl_og_mou_7','spl_og_mou_8','spl_og_mou_9',
'og_others_6','og_others_7','og_others_8','og_others_9',
'total_og_mou_6','total_og_mou_7','total_og_mou_8','total_og_mou_9',
'loc_ic_t2t_mou_6','loc_ic_t2t_mou_7','loc_ic_t2t_mou_8','loc_ic_t2t_mou_9',
'loc_ic_t2m_mou_6','loc_ic_t2m_mou_7','loc_ic_t2m_mou_8','loc_ic_t2m_mou_9',
'loc_ic_t2f_mou_6','loc_ic_t2f_mou_7','loc_ic_t2f_mou_8','loc_ic_t2f_mou_9',
'loc_ic_mou_6','loc_ic_mou_7','loc_ic_mou_8','loc_ic_mou_9',
'std_ic_t2t_mou_6','std_ic_t2t_mou_7','std_ic_t2t_mou_8','std_ic_t2t_mou_9',
'std_ic_t2m_mou_6','std_ic_t2m_mou_7','std_ic_t2m_mou_8','std_ic_t2m_mou_9',
'std_ic_t2f_mou_6','std_ic_t2f_mou_7','std_ic_t2f_mou_8','std_ic_t2f_mou_9',
'std_ic_t2o_mou_6','std_ic_t2o_mou_7','std_ic_t2o_mou_8','std_ic_t2o_mou_9',
'std_ic_mou_6','std_ic_mou_7','std_ic_mou_8','std_ic_mou_9',
'total_ic_mou_6','total_ic_mou_7','total_ic_mou_8','total_ic_mou_9',
'spl_ic_mou_6','spl_ic_mou_7','spl_ic_mou_8','spl_ic_mou_9',
'isd_ic_mou_6','isd_ic_mou_7','isd_ic_mou_8','isd_ic_mou_9',
'ic_others_6','ic_others_7','ic_others_8','ic_others_9',
'total_rech_data_6','total_rech_data_7','total_rech_data_8','total_rech_data_9',
'max_rech_data_6','max_rech_data_7','max_rech_data_8','max_rech_data_9',
'count_rech_2g_6','count_rech_2g_7','count_rech_2g_8','count_rech_2g_9',
'count_rech_3g_6','count_rech_3g_7','count_rech_3g_8','count_rech_3g_9',
'av_rech_amt_data_6','av_rech_amt_data_7','av_rech_amt_data_8','av_rech_amt_data_9'
,'vol_2g_mb_6','vol_2g_mb_7','vol_2g_mb_8','vol_2g_mb_9',
'vol_3g_mb_6','vol_3g_mb_7','vol_3g_mb_8','vol_3g_mb_9','arpu_3g_6','arpu_3g_7','arpu_3g_8','arpu_3g_9','arpu_2g_6','arpu_2g_7','arpu_2g_8','arpu_2g_9','night_pck_user_6','night_pck_user_7','night_pck_user_8','night_pck_user_9','fb_user_6','fb_user_7','fb_user_8','fb_user_9','aug_vbc_3g','jul_vbc_3g','jun_vbc_3g','sep_vbc_3g']]
#Checking for null values
float_null= round(100*(float_columns.isnull().sum()/len(float_columns.index)),2)
float_null[float_null > 10]
total_rech_data_6 74.85 total_rech_data_7 74.43 total_rech_data_8 73.66 total_rech_data_9 74.08 max_rech_data_6 74.85 max_rech_data_7 74.43 max_rech_data_8 73.66 max_rech_data_9 74.08 count_rech_2g_6 74.85 count_rech_2g_7 74.43 count_rech_2g_8 73.66 count_rech_2g_9 74.08 count_rech_3g_6 74.85 count_rech_3g_7 74.43 count_rech_3g_8 73.66 count_rech_3g_9 74.08 av_rech_amt_data_6 74.85 av_rech_amt_data_7 74.43 av_rech_amt_data_8 73.66 av_rech_amt_data_9 74.08 arpu_3g_6 74.85 arpu_3g_7 74.43 arpu_3g_8 73.66 arpu_3g_9 74.08 arpu_2g_6 74.85 arpu_2g_7 74.43 arpu_2g_8 73.66 arpu_2g_9 74.08 night_pck_user_6 74.85 night_pck_user_7 74.43 night_pck_user_8 73.66 night_pck_user_9 74.08 fb_user_6 74.85 fb_user_7 74.43 fb_user_8 73.66 fb_user_9 74.08 dtype: float64
# Dropping the above columns with more than 70% missing values
float_columns=float_columns.drop(['total_rech_data_6','total_rech_data_7','total_rech_data_8','total_rech_data_9',
'max_rech_data_6','max_rech_data_7','max_rech_data_8','max_rech_data_9','count_rech_2g_6',
'count_rech_2g_7','count_rech_2g_8','count_rech_2g_9','count_rech_3g_6','count_rech_3g_7',
'count_rech_3g_8','count_rech_3g_9','av_rech_amt_data_6','av_rech_amt_data_7',
'av_rech_amt_data_8','av_rech_amt_data_9','arpu_3g_6','arpu_3g_7','arpu_3g_8','arpu_3g_9',
'arpu_2g_6','arpu_2g_7','arpu_2g_8','arpu_2g_9','night_pck_user_6','night_pck_user_7',
'night_pck_user_8','night_pck_user_9','fb_user_6','fb_user_7','fb_user_8','fb_user_9'],axis=1)
#Remaining Columns for analysis
float_columns.columns.values
array(['loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou', 'arpu_6',
'arpu_7', 'arpu_8', 'arpu_9', 'onnet_mou_6', 'onnet_mou_7',
'onnet_mou_8', 'onnet_mou_9', 'offnet_mou_6', 'offnet_mou_7',
'offnet_mou_8', 'offnet_mou_9', 'roam_ic_mou_6', 'roam_ic_mou_7',
'roam_ic_mou_8', 'roam_ic_mou_9', 'roam_og_mou_6', 'roam_og_mou_7',
'roam_og_mou_8', 'roam_og_mou_9', 'loc_og_t2t_mou_6',
'loc_og_t2t_mou_7', 'loc_og_t2t_mou_8', 'loc_og_t2t_mou_9',
'loc_og_t2m_mou_6', 'loc_og_t2m_mou_7', 'loc_og_t2m_mou_8',
'loc_og_t2m_mou_9', 'loc_og_t2f_mou_6', 'loc_og_t2f_mou_7',
'loc_og_t2f_mou_8', 'loc_og_t2f_mou_9', 'loc_og_t2c_mou_6',
'loc_og_t2c_mou_7', 'loc_og_t2c_mou_8', 'loc_og_t2c_mou_9',
'loc_og_mou_6', 'loc_og_mou_7', 'loc_og_mou_8', 'loc_og_mou_9',
'std_og_t2t_mou_6', 'std_og_t2t_mou_7', 'std_og_t2t_mou_8',
'std_og_t2t_mou_9', 'std_og_t2m_mou_6', 'std_og_t2m_mou_7',
'std_og_t2m_mou_8', 'std_og_t2m_mou_9', 'std_og_t2f_mou_6',
'std_og_t2f_mou_7', 'std_og_t2f_mou_8', 'std_og_t2f_mou_9',
'std_og_t2c_mou_6', 'std_og_t2c_mou_7', 'std_og_t2c_mou_8',
'std_og_t2c_mou_9', 'std_og_mou_6', 'std_og_mou_7', 'std_og_mou_8',
'std_og_mou_9', 'isd_og_mou_6', 'isd_og_mou_7', 'isd_og_mou_8',
'isd_og_mou_9', 'spl_og_mou_6', 'spl_og_mou_7', 'spl_og_mou_8',
'spl_og_mou_9', 'og_others_6', 'og_others_7', 'og_others_8',
'og_others_9', 'total_og_mou_6', 'total_og_mou_7',
'total_og_mou_8', 'total_og_mou_9', 'loc_ic_t2t_mou_6',
'loc_ic_t2t_mou_7', 'loc_ic_t2t_mou_8', 'loc_ic_t2t_mou_9',
'loc_ic_t2m_mou_6', 'loc_ic_t2m_mou_7', 'loc_ic_t2m_mou_8',
'loc_ic_t2m_mou_9', 'loc_ic_t2f_mou_6', 'loc_ic_t2f_mou_7',
'loc_ic_t2f_mou_8', 'loc_ic_t2f_mou_9', 'loc_ic_mou_6',
'loc_ic_mou_7', 'loc_ic_mou_8', 'loc_ic_mou_9', 'std_ic_t2t_mou_6',
'std_ic_t2t_mou_7', 'std_ic_t2t_mou_8', 'std_ic_t2t_mou_9',
'std_ic_t2m_mou_6', 'std_ic_t2m_mou_7', 'std_ic_t2m_mou_8',
'std_ic_t2m_mou_9', 'std_ic_t2f_mou_6', 'std_ic_t2f_mou_7',
'std_ic_t2f_mou_8', 'std_ic_t2f_mou_9', 'std_ic_t2o_mou_6',
'std_ic_t2o_mou_7', 'std_ic_t2o_mou_8', 'std_ic_t2o_mou_9',
'std_ic_mou_6', 'std_ic_mou_7', 'std_ic_mou_8', 'std_ic_mou_9',
'total_ic_mou_6', 'total_ic_mou_7', 'total_ic_mou_8',
'total_ic_mou_9', 'spl_ic_mou_6', 'spl_ic_mou_7', 'spl_ic_mou_8',
'spl_ic_mou_9', 'isd_ic_mou_6', 'isd_ic_mou_7', 'isd_ic_mou_8',
'isd_ic_mou_9', 'ic_others_6', 'ic_others_7', 'ic_others_8',
'ic_others_9', 'vol_2g_mb_6', 'vol_2g_mb_7', 'vol_2g_mb_8',
'vol_2g_mb_9', 'vol_3g_mb_6', 'vol_3g_mb_7', 'vol_3g_mb_8',
'vol_3g_mb_9', 'aug_vbc_3g', 'jul_vbc_3g', 'jun_vbc_3g',
'sep_vbc_3g'], dtype=object)
# Checking for zero variance columns
float_columns.std()[float_columns.std()==0].index.values
array(['loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou',
'std_og_t2c_mou_6', 'std_og_t2c_mou_7', 'std_og_t2c_mou_8',
'std_og_t2c_mou_9', 'std_ic_t2o_mou_6', 'std_ic_t2o_mou_7',
'std_ic_t2o_mou_8', 'std_ic_t2o_mou_9'], dtype=object)
#Dropping these variable
float_columns=float_columns.drop(['loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou',
'std_og_t2c_mou_6', 'std_og_t2c_mou_7', 'std_og_t2c_mou_8',
'std_og_t2c_mou_9', 'std_ic_t2o_mou_6', 'std_ic_t2o_mou_7',
'std_ic_t2o_mou_8', 'std_ic_t2o_mou_9'],axis=1)
#Remaining Columns for analysis
float_columns.columns.values
array(['arpu_6', 'arpu_7', 'arpu_8', 'arpu_9', 'onnet_mou_6',
'onnet_mou_7', 'onnet_mou_8', 'onnet_mou_9', 'offnet_mou_6',
'offnet_mou_7', 'offnet_mou_8', 'offnet_mou_9', 'roam_ic_mou_6',
'roam_ic_mou_7', 'roam_ic_mou_8', 'roam_ic_mou_9', 'roam_og_mou_6',
'roam_og_mou_7', 'roam_og_mou_8', 'roam_og_mou_9',
'loc_og_t2t_mou_6', 'loc_og_t2t_mou_7', 'loc_og_t2t_mou_8',
'loc_og_t2t_mou_9', 'loc_og_t2m_mou_6', 'loc_og_t2m_mou_7',
'loc_og_t2m_mou_8', 'loc_og_t2m_mou_9', 'loc_og_t2f_mou_6',
'loc_og_t2f_mou_7', 'loc_og_t2f_mou_8', 'loc_og_t2f_mou_9',
'loc_og_t2c_mou_6', 'loc_og_t2c_mou_7', 'loc_og_t2c_mou_8',
'loc_og_t2c_mou_9', 'loc_og_mou_6', 'loc_og_mou_7', 'loc_og_mou_8',
'loc_og_mou_9', 'std_og_t2t_mou_6', 'std_og_t2t_mou_7',
'std_og_t2t_mou_8', 'std_og_t2t_mou_9', 'std_og_t2m_mou_6',
'std_og_t2m_mou_7', 'std_og_t2m_mou_8', 'std_og_t2m_mou_9',
'std_og_t2f_mou_6', 'std_og_t2f_mou_7', 'std_og_t2f_mou_8',
'std_og_t2f_mou_9', 'std_og_mou_6', 'std_og_mou_7', 'std_og_mou_8',
'std_og_mou_9', 'isd_og_mou_6', 'isd_og_mou_7', 'isd_og_mou_8',
'isd_og_mou_9', 'spl_og_mou_6', 'spl_og_mou_7', 'spl_og_mou_8',
'spl_og_mou_9', 'og_others_6', 'og_others_7', 'og_others_8',
'og_others_9', 'total_og_mou_6', 'total_og_mou_7',
'total_og_mou_8', 'total_og_mou_9', 'loc_ic_t2t_mou_6',
'loc_ic_t2t_mou_7', 'loc_ic_t2t_mou_8', 'loc_ic_t2t_mou_9',
'loc_ic_t2m_mou_6', 'loc_ic_t2m_mou_7', 'loc_ic_t2m_mou_8',
'loc_ic_t2m_mou_9', 'loc_ic_t2f_mou_6', 'loc_ic_t2f_mou_7',
'loc_ic_t2f_mou_8', 'loc_ic_t2f_mou_9', 'loc_ic_mou_6',
'loc_ic_mou_7', 'loc_ic_mou_8', 'loc_ic_mou_9', 'std_ic_t2t_mou_6',
'std_ic_t2t_mou_7', 'std_ic_t2t_mou_8', 'std_ic_t2t_mou_9',
'std_ic_t2m_mou_6', 'std_ic_t2m_mou_7', 'std_ic_t2m_mou_8',
'std_ic_t2m_mou_9', 'std_ic_t2f_mou_6', 'std_ic_t2f_mou_7',
'std_ic_t2f_mou_8', 'std_ic_t2f_mou_9', 'std_ic_mou_6',
'std_ic_mou_7', 'std_ic_mou_8', 'std_ic_mou_9', 'total_ic_mou_6',
'total_ic_mou_7', 'total_ic_mou_8', 'total_ic_mou_9',
'spl_ic_mou_6', 'spl_ic_mou_7', 'spl_ic_mou_8', 'spl_ic_mou_9',
'isd_ic_mou_6', 'isd_ic_mou_7', 'isd_ic_mou_8', 'isd_ic_mou_9',
'ic_others_6', 'ic_others_7', 'ic_others_8', 'ic_others_9',
'vol_2g_mb_6', 'vol_2g_mb_7', 'vol_2g_mb_8', 'vol_2g_mb_9',
'vol_3g_mb_6', 'vol_3g_mb_7', 'vol_3g_mb_8', 'vol_3g_mb_9',
'aug_vbc_3g', 'jul_vbc_3g', 'jun_vbc_3g', 'sep_vbc_3g'],
dtype=object)
#Checking for missing values
missing_float=round(100*(float_columns.isnull().sum()/len(float_columns.index)),2)
missing_float[missing_float > 0.1]
onnet_mou_6 3.94
onnet_mou_7 3.86
onnet_mou_8 5.38
onnet_mou_9 7.75
offnet_mou_6 3.94
...
isd_ic_mou_9 7.75
ic_others_6 3.94
ic_others_7 3.86
ic_others_8 5.38
ic_others_9 7.75
Length: 108, dtype: float64
#Imputing missing values
#Since the minutes of usage columns are all numerical, imputing missing values with zero
mou_cols = float_columns.columns[float_columns.columns.str.contains('mou')]
float_columns.loc[:,mou_cols] = float_columns.loc[:,mou_cols].replace(np.NaN,0)
missing_float=round(100*(float_columns.isnull().sum()/len(float_columns.index)),2)
missing_float[missing_float > 0.1]
og_others_6 3.94 og_others_7 3.86 og_others_8 5.38 og_others_9 7.75 ic_others_6 3.94 ic_others_7 3.86 ic_others_8 5.38 ic_others_9 7.75 dtype: float64
#Imputing missing values
others_cols = float_columns.columns[float_columns.columns.str.contains('others')]
float_columns.loc[:,others_cols] = float_columns.loc[:,others_cols].replace(np.NaN,0)
missing_float=round(100*(float_columns.isnull().sum()/len(float_columns.index)),2)
missing_float[missing_float > 0]
Series([], dtype: float64)
#Checking Incoming and Outgoing columns
ic_cols = float_columns.columns[float_columns.columns.str.contains('ic_mou_6')]
print(ic_cols)
float_columns.loc[:,ic_cols].head()
ic_all=float_columns.roam_ic_mou_6+ float_columns.loc_ic_mou_6+float_columns.std_ic_mou_6+ float_columns.spl_ic_mou_6+float_columns.isd_ic_mou_6
print(ic_all.head())
print(float_columns.total_ic_mou_6.head())
#Since the total incoming column is equal to the sum of other
Index(['roam_ic_mou_6', 'loc_ic_mou_6', 'std_ic_mou_6', 'total_ic_mou_6',
'spl_ic_mou_6', 'isd_ic_mou_6'],
dtype='object')
0 0.00
1 26.82
2 369.10
3 280.06
4 215.96
dtype: float64
0 0.00
1 26.83
2 370.04
3 280.08
4 216.44
Name: total_ic_mou_6, dtype: float64
## Deriving new features from existing columns
# Average of first 2 months
float_columns.loc[:,'arpu_good_ph'] = (float_columns['arpu_6']+float_columns['arpu_7'])/2
float_columns.loc[:,'spl_ic_mou_good_ph'] = (float_columns.spl_ic_mou_6 + float_columns.spl_ic_mou_7)/2
float_columns.loc[:,'isd_ic_mou_good_ph'] = (float_columns.isd_ic_mou_6 + float_columns.isd_ic_mou_7)/2
float_columns.loc[:,'roam_ic_mou_good_ph'] = (float_columns.roam_ic_mou_6 + float_columns.roam_ic_mou_7)/2
float_columns.loc[:,'std_ic_t2m_mou_good_ph'] = (float_columns.std_ic_t2m_mou_6 + float_columns.std_ic_t2m_mou_7)/2
float_columns.loc[:,'std_ic_t2t_mou_good_ph'] = (float_columns.std_ic_t2t_mou_6 + float_columns.std_ic_t2t_mou_7)/2
float_columns.loc[:,'std_ic_t2f_mou_good_ph'] = (float_columns.std_ic_t2f_mou_6 + float_columns.std_ic_t2f_mou_7)/2
float_columns.loc[:,'loc_ic_t2m_mou_good_ph'] = (float_columns.loc_ic_t2m_mou_6 + float_columns.loc_ic_t2m_mou_7)/2
float_columns.loc[:,'loc_ic_t2t_mou_good_ph'] = (float_columns.loc_ic_t2t_mou_6 + float_columns.loc_ic_t2t_mou_7)/2
float_columns.loc[:,'loc_ic_t2f_mou_good_ph'] = (float_columns.loc_ic_t2f_mou_6 + float_columns.loc_ic_t2f_mou_7)/2
float_columns.loc[:,'offnet_mou_good_ph'] = (float_columns.offnet_mou_6 + float_columns.offnet_mou_7)/2
float_columns.loc[:,'onnet_mou_good_ph'] = (float_columns.onnet_mou_6 + float_columns.onnet_mou_7)/2
float_columns.loc[:,'og_others_good_ph'] = (float_columns.og_others_6 + float_columns.og_others_7)/2
float_columns.loc[:,'ic_others_good_ph'] = (float_columns.ic_others_6 + float_columns.ic_others_7)/2
float_columns.loc[:,'spl_og_mou_good_ph'] = (float_columns.spl_og_mou_6 + float_columns.spl_og_mou_7)/2
float_columns.loc[:,'isd_og_mou_good_ph'] = (float_columns.isd_og_mou_6 + float_columns.isd_og_mou_7)/2
float_columns.loc[:,'roam_og_mou_good_ph'] = (float_columns.roam_og_mou_6 + float_columns.roam_og_mou_7)/2
float_columns.loc[:,'std_og_t2m_mou_good_ph'] = (float_columns.std_og_t2m_mou_6 + float_columns.std_og_t2m_mou_7)/2
float_columns.loc[:,'std_og_t2t_mou_good_ph'] = (float_columns.std_og_t2t_mou_6 + float_columns.std_og_t2t_mou_7)/2
float_columns.loc[:,'std_og_t2f_mou_good_ph'] = (float_columns.std_og_t2f_mou_6 + float_columns.std_og_t2f_mou_7)/2
float_columns.loc[:,'loc_og_t2m_mou_good_ph'] = (float_columns.loc_og_t2m_mou_6 + float_columns.loc_og_t2m_mou_7)/2
float_columns.loc[:,'loc_og_t2t_mou_good_ph'] = (float_columns.loc_og_t2t_mou_6 + float_columns.loc_og_t2t_mou_7)/2
float_columns.loc[:,'loc_og_t2f_mou_good_ph'] = (float_columns.loc_og_t2f_mou_6 + float_columns.loc_og_t2f_mou_7)/2
float_columns.loc[:,'loc_og_t2c_mou_good_ph'] = (float_columns.loc_og_t2c_mou_6 + float_columns.loc_og_t2c_mou_7)/2
float_columns.loc[:,'total_ic_mou_good_ph'] = (float_columns.total_ic_mou_6 + float_columns.total_ic_mou_7)/2
float_columns.loc[:,'total_og_mou_good_ph'] = (float_columns.total_og_mou_6 + float_columns.total_og_mou_7)/2
float_columns.loc[:,'vbc_3g_good_ph'] = (float_columns.jun_vbc_3g + float_columns.jul_vbc_3g)/2
float_columns.loc[:,'std_ic_mou_good_ph'] = (float_columns.std_ic_mou_6 + float_columns.std_ic_mou_7)/2
float_columns.loc[:,'loc_ic_mou_good_ph'] = (float_columns.loc_ic_mou_6 + float_columns.loc_ic_mou_7)/2
float_columns.loc[:,'std_og_mou_good_ph'] = (float_columns.std_og_mou_6 + float_columns.std_og_mou_7)/2
float_columns.loc[:,'loc_og_mou_good_ph'] = (float_columns.loc_og_mou_6 + float_columns.loc_og_mou_7)/2
float_columns.loc[:,'vol_2g_mb_good_ph'] = (float_columns.vol_2g_mb_6 + float_columns.vol_2g_mb_7)/2
float_columns.loc[:,'vol_3g_mb_good_ph'] = (float_columns.vol_3g_mb_6 + float_columns.vol_3g_mb_7)/2
#Grouping attributes needed to use to tag churners
usage_cols=['total_ic_mou_9','total_og_mou_9','vol_2g_mb_9','vol_3g_mb_9']
def churn_calc(row):
return 1 if (row['total_ic_mou_9']==0) & (row['total_og_mou_9']==0) & (row['vol_2g_mb_9']==0) & (row['vol_3g_mb_9']==0) else 0
#Deriving Churner tags
float_columns['churn']=float_columns[usage_cols].apply(churn_calc,axis=1)
float_columns['churn'].value_counts()
churn 0 89808 1 10191 Name: count, dtype: int64
#We are calculating threshold based on the normal distribution to detect the drop in telecom usage.
def phase_calc(row):
#total_ic_mou_6
ic_std=np.std([row['total_ic_mou_6'],row['total_ic_mou_7']])
ic_mean=np.mean([row['total_ic_mou_6'],row['total_ic_mou_7']])
ic_threshold=ic_mean-ic_std
#total_og_mou_6
og_std=np.std([row['total_og_mou_6'],row['total_og_mou_7']])
og_mean=np.mean([row['total_og_mou_6'],row['total_og_mou_7']])
og_threshold=og_mean-og_std
#vol_2g_mb_6
two_g_std=np.std([row['vol_2g_mb_6'],row['vol_2g_mb_7']])
two_g_mean=np.mean([row['vol_2g_mb_6'],row['vol_2g_mb_7']])
two_g_threshold=two_g_mean-two_g_std
#vol_3g_mb_6
three_g_std=np.std([row['vol_3g_mb_6'],row['vol_3g_mb_7']])
three_g_mean=np.mean([row['vol_3g_mb_6'],row['vol_3g_mb_7']])
three_g_threshold=three_g_mean-three_g_std
if (row['total_ic_mou_8'] < ic_threshold) | (row['total_og_mou_8'] < og_threshold)\
| (row['vol_2g_mb_8'] < two_g_threshold) | (row['vol_3g_mb_8'] < three_g_threshold):
return 1
else:
return 0
float_columns['action_required']=float_columns.apply(phase_calc,axis=1)
float_columns['action_required'].value_counts()
action_required 1 51814 0 48185 Name: count, dtype: int64
# Dropping the _9 suffix variables
# Dropping the _6 and _7 suffix variables because we have derived the columns from these variables
float_columns=float_columns.drop(['spl_ic_mou_6','isd_ic_mou_6','roam_ic_mou_6','std_ic_t2m_mou_6','std_ic_t2t_mou_6',
'std_ic_t2f_mou_6','loc_ic_t2m_mou_6','loc_ic_t2t_mou_6','loc_ic_t2f_mou_6',
'offnet_mou_6','onnet_mou_6','og_others_6','ic_others_7','spl_og_mou_6','isd_og_mou_6',
'roam_og_mou_6','std_og_t2m_mou_6','std_og_t2t_mou_6','std_og_t2f_mou_6',
'loc_og_t2m_mou_6','loc_og_t2t_mou_6','loc_og_t2f_mou_6','loc_og_t2c_mou_6',
'total_ic_mou_6','total_og_mou_6','spl_ic_mou_7','isd_ic_mou_7','roam_ic_mou_7',
'std_ic_t2m_mou_7','std_ic_t2t_mou_7','std_ic_t2f_mou_7','loc_ic_t2m_mou_7',
'loc_ic_t2t_mou_7','loc_ic_t2f_mou_7','offnet_mou_7','onnet_mou_7','og_others_7',
'ic_others_6','spl_og_mou_7','isd_og_mou_7','roam_og_mou_7','std_og_t2m_mou_7',
'std_og_t2t_mou_7','std_og_t2f_mou_7','loc_og_t2m_mou_7','loc_og_t2t_mou_7',
'loc_og_t2f_mou_7','loc_og_t2c_mou_7','total_ic_mou_7','total_og_mou_7','arpu_9',
'onnet_mou_9', 'offnet_mou_9', 'roam_ic_mou_9','roam_og_mou_9', 'loc_og_t2t_mou_9',
'loc_og_t2m_mou_9','loc_og_t2f_mou_9', 'loc_og_t2c_mou_9', 'loc_og_mou_9',
'std_og_t2t_mou_9', 'std_og_t2m_mou_9', 'std_og_t2f_mou_9','std_og_mou_9', 'isd_og_mou_9',
'spl_og_mou_9', 'og_others_9','total_og_mou_9', 'loc_ic_t2t_mou_9', 'loc_ic_t2m_mou_9',
'loc_ic_t2f_mou_9', 'loc_ic_mou_9', 'std_ic_t2t_mou_9','std_ic_t2m_mou_9',
'std_ic_t2f_mou_9', 'std_ic_mou_9','vol_2g_mb_6', 'vol_2g_mb_7','vol_3g_mb_6', 'vol_3g_mb_7',
'total_ic_mou_9', 'spl_ic_mou_9', 'isd_ic_mou_9', 'ic_others_9','vol_2g_mb_9',
'vol_3g_mb_9','arpu_6', 'arpu_7','std_ic_mou_6','std_ic_mou_7',
'loc_ic_mou_6','loc_ic_mou_7','std_og_mou_6','std_og_mou_7','loc_og_mou_6',
'loc_og_mou_7','jul_vbc_3g', 'jun_vbc_3g','sep_vbc_3g'],axis=1)
#Final columns in dataframe 3 : float_columns
float_columns.columns.values
array(['arpu_8', 'onnet_mou_8', 'offnet_mou_8', 'roam_ic_mou_8',
'roam_og_mou_8', 'loc_og_t2t_mou_8', 'loc_og_t2m_mou_8',
'loc_og_t2f_mou_8', 'loc_og_t2c_mou_8', 'loc_og_mou_8',
'std_og_t2t_mou_8', 'std_og_t2m_mou_8', 'std_og_t2f_mou_8',
'std_og_mou_8', 'isd_og_mou_8', 'spl_og_mou_8', 'og_others_8',
'total_og_mou_8', 'loc_ic_t2t_mou_8', 'loc_ic_t2m_mou_8',
'loc_ic_t2f_mou_8', 'loc_ic_mou_8', 'std_ic_t2t_mou_8',
'std_ic_t2m_mou_8', 'std_ic_t2f_mou_8', 'std_ic_mou_8',
'total_ic_mou_8', 'spl_ic_mou_8', 'isd_ic_mou_8', 'ic_others_8',
'vol_2g_mb_8', 'vol_3g_mb_8', 'aug_vbc_3g', 'arpu_good_ph',
'spl_ic_mou_good_ph', 'isd_ic_mou_good_ph', 'roam_ic_mou_good_ph',
'std_ic_t2m_mou_good_ph', 'std_ic_t2t_mou_good_ph',
'std_ic_t2f_mou_good_ph', 'loc_ic_t2m_mou_good_ph',
'loc_ic_t2t_mou_good_ph', 'loc_ic_t2f_mou_good_ph',
'offnet_mou_good_ph', 'onnet_mou_good_ph', 'og_others_good_ph',
'ic_others_good_ph', 'spl_og_mou_good_ph', 'isd_og_mou_good_ph',
'roam_og_mou_good_ph', 'std_og_t2m_mou_good_ph',
'std_og_t2t_mou_good_ph', 'std_og_t2f_mou_good_ph',
'loc_og_t2m_mou_good_ph', 'loc_og_t2t_mou_good_ph',
'loc_og_t2f_mou_good_ph', 'loc_og_t2c_mou_good_ph',
'total_ic_mou_good_ph', 'total_og_mou_good_ph', 'vbc_3g_good_ph',
'std_ic_mou_good_ph', 'loc_ic_mou_good_ph', 'std_og_mou_good_ph',
'loc_og_mou_good_ph', 'vol_2g_mb_good_ph', 'vol_3g_mb_good_ph',
'churn', 'action_required'], dtype=object)
master_frame = pd.concat([int_columns,date_columns,float_columns], axis=1)
master_frame.columns.values
array(['mobile_number', 'total_rech_num_8', 'total_rech_amt_8',
'max_rech_amt_8', 'last_day_rch_amt_8', 'monthly_2g_8',
'sachet_2g_8', 'monthly_3g_8', 'sachet_3g_8', 'aon_years',
'sachet_2g_good_ph', 'sachet_3g_good_ph', 'monthly_2g_good_ph',
'monthly_3g_good_ph', 'max_rech_amt_good_ph',
'total_rech_num_good_ph', 'tot_rech_amt_good_ph',
'last_day_rch_amt_good_ph', 'day_of_last_rech_6',
'day_of_last_rech_7', 'day_of_last_rech_8', 'arpu_8',
'onnet_mou_8', 'offnet_mou_8', 'roam_ic_mou_8', 'roam_og_mou_8',
'loc_og_t2t_mou_8', 'loc_og_t2m_mou_8', 'loc_og_t2f_mou_8',
'loc_og_t2c_mou_8', 'loc_og_mou_8', 'std_og_t2t_mou_8',
'std_og_t2m_mou_8', 'std_og_t2f_mou_8', 'std_og_mou_8',
'isd_og_mou_8', 'spl_og_mou_8', 'og_others_8', 'total_og_mou_8',
'loc_ic_t2t_mou_8', 'loc_ic_t2m_mou_8', 'loc_ic_t2f_mou_8',
'loc_ic_mou_8', 'std_ic_t2t_mou_8', 'std_ic_t2m_mou_8',
'std_ic_t2f_mou_8', 'std_ic_mou_8', 'total_ic_mou_8',
'spl_ic_mou_8', 'isd_ic_mou_8', 'ic_others_8', 'vol_2g_mb_8',
'vol_3g_mb_8', 'aug_vbc_3g', 'arpu_good_ph', 'spl_ic_mou_good_ph',
'isd_ic_mou_good_ph', 'roam_ic_mou_good_ph',
'std_ic_t2m_mou_good_ph', 'std_ic_t2t_mou_good_ph',
'std_ic_t2f_mou_good_ph', 'loc_ic_t2m_mou_good_ph',
'loc_ic_t2t_mou_good_ph', 'loc_ic_t2f_mou_good_ph',
'offnet_mou_good_ph', 'onnet_mou_good_ph', 'og_others_good_ph',
'ic_others_good_ph', 'spl_og_mou_good_ph', 'isd_og_mou_good_ph',
'roam_og_mou_good_ph', 'std_og_t2m_mou_good_ph',
'std_og_t2t_mou_good_ph', 'std_og_t2f_mou_good_ph',
'loc_og_t2m_mou_good_ph', 'loc_og_t2t_mou_good_ph',
'loc_og_t2f_mou_good_ph', 'loc_og_t2c_mou_good_ph',
'total_ic_mou_good_ph', 'total_og_mou_good_ph', 'vbc_3g_good_ph',
'std_ic_mou_good_ph', 'loc_ic_mou_good_ph', 'std_og_mou_good_ph',
'loc_og_mou_good_ph', 'vol_2g_mb_good_ph', 'vol_3g_mb_good_ph',
'churn', 'action_required'], dtype=object)
print(master_frame.shape)
print(master_frame.info())
(99999, 89) <class 'pandas.core.frame.DataFrame'> RangeIndex: 99999 entries, 0 to 99998 Data columns (total 89 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mobile_number 99999 non-null int64 1 total_rech_num_8 99999 non-null int64 2 total_rech_amt_8 99999 non-null int64 3 max_rech_amt_8 99999 non-null int64 4 last_day_rch_amt_8 99999 non-null int64 5 monthly_2g_8 99999 non-null int64 6 sachet_2g_8 99999 non-null int64 7 monthly_3g_8 99999 non-null int64 8 sachet_3g_8 99999 non-null int64 9 aon_years 99999 non-null float64 10 sachet_2g_good_ph 99999 non-null float64 11 sachet_3g_good_ph 99999 non-null float64 12 monthly_2g_good_ph 99999 non-null float64 13 monthly_3g_good_ph 99999 non-null float64 14 max_rech_amt_good_ph 99999 non-null float64 15 total_rech_num_good_ph 99999 non-null float64 16 tot_rech_amt_good_ph 99999 non-null float64 17 last_day_rch_amt_good_ph 99999 non-null float64 18 day_of_last_rech_6 99999 non-null object 19 day_of_last_rech_7 99999 non-null object 20 day_of_last_rech_8 99999 non-null object 21 arpu_8 99999 non-null float64 22 onnet_mou_8 99999 non-null float64 23 offnet_mou_8 99999 non-null float64 24 roam_ic_mou_8 99999 non-null float64 25 roam_og_mou_8 99999 non-null float64 26 loc_og_t2t_mou_8 99999 non-null float64 27 loc_og_t2m_mou_8 99999 non-null float64 28 loc_og_t2f_mou_8 99999 non-null float64 29 loc_og_t2c_mou_8 99999 non-null float64 30 loc_og_mou_8 99999 non-null float64 31 std_og_t2t_mou_8 99999 non-null float64 32 std_og_t2m_mou_8 99999 non-null float64 33 std_og_t2f_mou_8 99999 non-null float64 34 std_og_mou_8 99999 non-null float64 35 isd_og_mou_8 99999 non-null float64 36 spl_og_mou_8 99999 non-null float64 37 og_others_8 99999 non-null float64 38 total_og_mou_8 99999 non-null float64 39 loc_ic_t2t_mou_8 99999 non-null float64 40 loc_ic_t2m_mou_8 99999 non-null float64 41 loc_ic_t2f_mou_8 99999 non-null float64 42 loc_ic_mou_8 99999 non-null float64 43 std_ic_t2t_mou_8 99999 non-null float64 44 std_ic_t2m_mou_8 99999 non-null float64 45 std_ic_t2f_mou_8 99999 non-null float64 46 std_ic_mou_8 99999 non-null float64 47 total_ic_mou_8 99999 non-null float64 48 spl_ic_mou_8 99999 non-null float64 49 isd_ic_mou_8 99999 non-null float64 50 ic_others_8 99999 non-null float64 51 vol_2g_mb_8 99999 non-null float64 52 vol_3g_mb_8 99999 non-null float64 53 aug_vbc_3g 99999 non-null float64 54 arpu_good_ph 99999 non-null float64 55 spl_ic_mou_good_ph 99999 non-null float64 56 isd_ic_mou_good_ph 99999 non-null float64 57 roam_ic_mou_good_ph 99999 non-null float64 58 std_ic_t2m_mou_good_ph 99999 non-null float64 59 std_ic_t2t_mou_good_ph 99999 non-null float64 60 std_ic_t2f_mou_good_ph 99999 non-null float64 61 loc_ic_t2m_mou_good_ph 99999 non-null float64 62 loc_ic_t2t_mou_good_ph 99999 non-null float64 63 loc_ic_t2f_mou_good_ph 99999 non-null float64 64 offnet_mou_good_ph 99999 non-null float64 65 onnet_mou_good_ph 99999 non-null float64 66 og_others_good_ph 99999 non-null float64 67 ic_others_good_ph 99999 non-null float64 68 spl_og_mou_good_ph 99999 non-null float64 69 isd_og_mou_good_ph 99999 non-null float64 70 roam_og_mou_good_ph 99999 non-null float64 71 std_og_t2m_mou_good_ph 99999 non-null float64 72 std_og_t2t_mou_good_ph 99999 non-null float64 73 std_og_t2f_mou_good_ph 99999 non-null float64 74 loc_og_t2m_mou_good_ph 99999 non-null float64 75 loc_og_t2t_mou_good_ph 99999 non-null float64 76 loc_og_t2f_mou_good_ph 99999 non-null float64 77 loc_og_t2c_mou_good_ph 99999 non-null float64 78 total_ic_mou_good_ph 99999 non-null float64 79 total_og_mou_good_ph 99999 non-null float64 80 vbc_3g_good_ph 99999 non-null float64 81 std_ic_mou_good_ph 99999 non-null float64 82 loc_ic_mou_good_ph 99999 non-null float64 83 std_og_mou_good_ph 99999 non-null float64 84 loc_og_mou_good_ph 99999 non-null float64 85 vol_2g_mb_good_ph 99999 non-null float64 86 vol_3g_mb_good_ph 99999 non-null float64 87 churn 99999 non-null int64 88 action_required 99999 non-null int64 dtypes: float64(75), int64(11), object(3) memory usage: 67.9+ MB None
threshold_value=master_frame['tot_rech_amt_good_ph'].quantile(.70)
master_frame['high_value_customer']=master_frame['tot_rech_amt_good_ph'].apply(lambda x: True if x==True else ( True if x > threshold_value else False))
master_frame['high_value_customer'].value_counts()
high_value_customer False 70020 True 29979 Name: count, dtype: int64
master_frame['churn'].value_counts()
churn 0 89808 1 10191 Name: count, dtype: int64
## Filtering HVC data from master_frame
master_frame_HVC=master_frame[master_frame['high_value_customer']==True]
master_frame_HVC.shape
(29979, 90)
master_frame_HVC=master_frame_HVC.set_index('mobile_number',drop=True)
master_frame_HVC.head()
| total_rech_num_8 | total_rech_amt_8 | max_rech_amt_8 | last_day_rch_amt_8 | monthly_2g_8 | sachet_2g_8 | monthly_3g_8 | sachet_3g_8 | aon_years | sachet_2g_good_ph | ... | vbc_3g_good_ph | std_ic_mou_good_ph | loc_ic_mou_good_ph | std_og_mou_good_ph | loc_og_mou_good_ph | vol_2g_mb_good_ph | vol_3g_mb_good_ph | churn | action_required | high_value_customer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| mobile_number | |||||||||||||||||||||
| 7000701601 | 7 | 3638 | 1580 | 779 | 0 | 0 | 0 | 0 | 2.0 | 0.0 | ... | 19.060 | 176.810 | 391.555 | 68.000 | 478.035 | 0.0 | 0.000 | 1 | 1 | True |
| 7001524846 | 14 | 120 | 30 | 10 | 0 | 3 | 0 | 0 | 0.0 | 0.5 | ... | 516.405 | 13.545 | 52.630 | 136.870 | 333.135 | 178.0 | 375.475 | 0 | 1 | True |
| 7002191713 | 11 | 717 | 130 | 0 | 0 | 3 | 0 | 0 | 7.0 | 0.0 | ... | 0.000 | 245.950 | 120.845 | 420.355 | 129.395 | 0.0 | 0.000 | 0 | 0 | True |
| 7000875565 | 2 | 160 | 130 | 130 | 0 | 0 | 0 | 0 | 1.0 | 0.0 | ... | 12.170 | 119.860 | 253.345 | 48.990 | 265.775 | 0.0 | 0.000 | 0 | 1 | True |
| 7000187447 | 4 | 30 | 30 | 0 | 0 | 0 | 0 | 0 | 1.0 | 0.0 | ... | 0.000 | 4.955 | 42.135 | 574.635 | 48.770 | 0.0 | 0.000 | 0 | 1 | True |
5 rows × 89 columns
#Churn counts for HVC customers
master_frame_HVC['churn'].value_counts()
churn 0 27390 1 2589 Name: count, dtype: int64
churn_percentage=(master_frame_HVC['churn'].value_counts()*100.0 /len(master_frame_HVC)),2
print(churn_percentage)
#churn_rate_graph = (master_frame_HVC['churn'].value_counts()*100.0 /len(master_frame_HVC)).plot(kind='bar',stacked = True,rot = 0,figsize = (8,6))
#churn_rate_graph.set_ylabel('% Customers',size = 14)
#churn_rate_graph.set_xlabel('Non_Churn Vs churn',size = 14)
#churn_rate_graph.set_title('churn Rate', size = 14)
sns.barplot(x="churn", y="churn", data=master_frame_HVC, estimator=lambda x: len(x) / len(master_frame_HVC) * 100)
plt.title("Non_Churn rate Vs churn rate")
(churn 0 91.363955 1 8.636045 Name: count, dtype: float64, 2)
Text(0.5, 1.0, 'Non_Churn rate Vs churn rate')
#Churn comparisions on other columns
#sns.boxplot(x=df.churn, y="aon_years", data=df)
#plt.title("Churn stats based on age on network")
aon_churn_hvc_counts=master_frame_HVC.pivot_table(index=['aon_years'],values=['high_value_customer','churn'], aggfunc='sum')
print(aon_churn_hvc_counts)
#plt.subplot(1,1, 1)
usa=sns.barplot(x=aon_churn_hvc_counts.index, y="churn", data=aon_churn_hvc_counts)
plt.title("Churn count based on age on network")
churn high_value_customer aon_years 0.0 672 4853 1.0 818 7616 2.0 504 4661 3.0 197 3044 4.0 110 1980 5.0 51 1251 6.0 78 2069 7.0 73 1826 8.0 53 1523 9.0 19 572 10.0 14 556 11.0 0 28
Text(0.5, 1.0, 'Churn count based on age on network')
#Churn stats based on monthly ARPU
plt.figure(1)
plt.subplot(1,1, 1)
sns.boxplot(x=master_frame_HVC.churn, y="arpu_good_ph", data=master_frame_HVC, showfliers=False)
plt.title("Churn stats based on arpu in good phase")
plt.figure(2)
plt.subplot(1, 2, 2)
sns.boxplot(x=master_frame_HVC.churn, y="arpu_8", data=master_frame_HVC, showfliers=False)
plt.title("Churn stats based on arpu in August")
Text(0.5, 1.0, 'Churn stats based on arpu in August')
#Churn stats based on monthly Total recharge
plt.figure(1)
plt.subplot(1,1, 1)
sns.boxplot(x=master_frame_HVC.churn, y="tot_rech_amt_good_ph", data=master_frame_HVC, showfliers=False)
plt.title("Churn stats based on total recharge in good phase")
plt.figure(2)
plt.subplot(1, 2, 2)
sns.boxplot(x=master_frame_HVC.churn, y="total_rech_amt_8", data=master_frame_HVC, showfliers=False)
plt.title("Churn stats based on total recharge in August")
Text(0.5, 1.0, 'Churn stats based on total recharge in August')
#Churn stats based on day of last recharge
last_day_rech_hvc_count_6=master_frame_HVC.pivot_table(index=['day_of_last_rech_6'],values=['high_value_customer','churn'], aggfunc='sum')
last_day_rech_hvc_count_7=master_frame_HVC.pivot_table(index=['day_of_last_rech_7'],values=['high_value_customer','churn'], aggfunc='sum')
last_day_rech_hvc_count_8=master_frame_HVC.pivot_table(index=['day_of_last_rech_8'],values=['high_value_customer','churn'], aggfunc='sum')
plt.figure(1)
plt.rcParams['figure.figsize']=(5,5)
plt.subplot(1,1, 1)
sns.lineplot(x=last_day_rech_hvc_count_6.index, y="churn", data=last_day_rech_hvc_count_6, )
plt.title("Churn stats based on day of last recharge in June")
plt.figure(2)
plt.rcParams['figure.figsize']=(5,5)
plt.subplot(1,1, 1)
sns.lineplot(x=last_day_rech_hvc_count_7.index, y="churn", data=last_day_rech_hvc_count_7)
plt.title("Churn stats based on day of last recharge in July")
plt.figure(3)
plt.rcParams['figure.figsize']=(5,5)
#plt.subplot(1,1, 1)
sns.lineplot(x=last_day_rech_hvc_count_8.index, y="churn", data=last_day_rech_hvc_count_8)
plt.title("Churn stats based on day of last recharge in August")
Text(0.5, 1.0, 'Churn stats based on day of last recharge in August')
# Importing matplotlib and seaborn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Let's see the correlation matrix
plt.figure(figsize = (30,30)) # Size of the figure
sns.heatmap(master_frame_HVC.corr(),annot = True)
plt.show()
# To handle class imbalance.
logregcv = LogisticRegressionCV(class_weight='balanced',cv=20)
from sklearn.model_selection import train_test_split
X=master_frame_HVC.drop(['churn'],axis=1)
y=master_frame_HVC['churn']
y.head()
mobile_number 7000701601 1 7001524846 0 7002191713 0 7000875565 0 7000187447 0 Name: churn, dtype: int64
X_train , X_test , y_train , y_test = train_test_split(X,y,train_size=0.7, random_state=42,stratify=y)
X_train.head()
| total_rech_num_8 | total_rech_amt_8 | max_rech_amt_8 | last_day_rch_amt_8 | monthly_2g_8 | sachet_2g_8 | monthly_3g_8 | sachet_3g_8 | aon_years | sachet_2g_good_ph | ... | total_og_mou_good_ph | vbc_3g_good_ph | std_ic_mou_good_ph | loc_ic_mou_good_ph | std_og_mou_good_ph | loc_og_mou_good_ph | vol_2g_mb_good_ph | vol_3g_mb_good_ph | action_required | high_value_customer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| mobile_number | |||||||||||||||||||||
| 7000669104 | 6 | 546 | 130 | 0 | 0 | 0 | 0 | 0 | 4.0 | 0.0 | ... | 1848.215 | 0.000 | 45.270 | 77.005 | 1721.110 | 106.90 | 0.865 | 512.33 | 1 | True |
| 7000402026 | 8 | 824 | 154 | 150 | 1 | 0 | 0 | 0 | 2.0 | 0.0 | ... | 301.160 | 0.000 | 67.860 | 47.880 | 236.930 | 55.67 | 330.245 | 0.71 | 1 | True |
| 7001391178 | 21 | 764 | 130 | 23 | 0 | 0 | 0 | 0 | 3.0 | 0.0 | ... | 90.930 | 76.075 | 16.630 | 135.855 | 4.155 | 80.23 | 0.000 | 0.00 | 0 | True |
| 7002018307 | 6 | 650 | 130 | 130 | 0 | 0 | 0 | 0 | 6.0 | 0.0 | ... | 434.210 | 0.000 | 23.410 | 130.000 | 106.900 | 322.47 | 0.000 | 0.00 | 1 | True |
| 7001748476 | 11 | 605 | 169 | 0 | 0 | 0 | 0 | 0 | 1.0 | 0.0 | ... | 727.720 | 0.000 | 1.455 | 26.880 | 694.745 | 32.51 | 0.000 | 0.00 | 0 | True |
5 rows × 88 columns
X_train.shape
(20985, 88)
X_test.shape
(8994, 88)
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.fit_transform(X_test)
X_train = pd.DataFrame(X_train, columns=X.columns.tolist())
X_test = pd.DataFrame(X_test, columns=X.columns.tolist())
X_train.head()
| total_rech_num_8 | total_rech_amt_8 | max_rech_amt_8 | last_day_rch_amt_8 | monthly_2g_8 | sachet_2g_8 | monthly_3g_8 | sachet_3g_8 | aon_years | sachet_2g_good_ph | ... | total_og_mou_good_ph | vbc_3g_good_ph | std_ic_mou_good_ph | loc_ic_mou_good_ph | std_og_mou_good_ph | loc_og_mou_good_ph | vol_2g_mb_good_ph | vol_3g_mb_good_ph | action_required | high_value_customer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.446222 | -0.120083 | -0.187419 | -0.645018 | -0.318315 | -0.327589 | -0.29811 | -0.142157 | 0.382464 | -0.322029 | ... | 1.898281 | -0.349485 | -0.050728 | -0.563974 | 2.299885 | -0.490779 | -0.302375 | 0.328327 | 0.784442 | 0.0 |
| 1 | -0.234334 | 0.410158 | -0.050759 | 0.368976 | 2.470764 | -0.327589 | -0.29811 | -0.142157 | -0.354272 | -0.322029 | ... | -0.650806 | -0.349485 | 0.119928 | -0.661131 | -0.294499 | -0.634671 | 0.963922 | -0.352741 | 0.784442 | 0.0 |
| 2 | 1.142938 | 0.295717 | -0.187419 | -0.489539 | -0.318315 | -0.327589 | -0.29811 | -0.142157 | 0.014096 | -0.322029 | ... | -0.997202 | -0.145330 | -0.267088 | -0.367658 | -0.701396 | -0.565689 | -0.305701 | -0.353686 | -1.274792 | 0.0 |
| 3 | -0.446222 | 0.078281 | -0.187419 | 0.233777 | -0.318315 | -0.327589 | -0.29811 | -0.142157 | 1.119200 | -0.322029 | ... | -0.431579 | -0.349485 | -0.215869 | -0.387189 | -0.521795 | 0.114703 | -0.305701 | -0.353686 | 0.784442 | 0.0 |
| 4 | 0.083498 | -0.007550 | 0.034653 | -0.645018 | -0.318315 | -0.327589 | -0.29811 | -0.142157 | -0.722640 | -0.322029 | ... | 0.052038 | -0.349485 | -0.381728 | -0.731184 | 0.505773 | -0.699722 | -0.305701 | -0.353686 | -1.274792 | 0.0 |
5 rows × 88 columns
X_test.head()
| total_rech_num_8 | total_rech_amt_8 | max_rech_amt_8 | last_day_rch_amt_8 | monthly_2g_8 | sachet_2g_8 | monthly_3g_8 | sachet_3g_8 | aon_years | sachet_2g_good_ph | ... | total_og_mou_good_ph | vbc_3g_good_ph | std_ic_mou_good_ph | loc_ic_mou_good_ph | std_og_mou_good_ph | loc_og_mou_good_ph | vol_2g_mb_good_ph | vol_3g_mb_good_ph | action_required | high_value_customer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.444856 | -0.530691 | -0.392343 | -0.692552 | -0.321494 | 1.283644 | -0.295119 | -0.148593 | -0.714719 | -0.027169 | ... | 0.032974 | -0.353104 | 0.198055 | -0.551233 | 0.022477 | 0.047114 | 0.582397 | -0.309060 | 0.777585 | 0.0 |
| 1 | -1.071519 | -0.830938 | -0.984200 | -0.692552 | -0.321494 | -0.333467 | -0.295119 | -0.148593 | 1.122429 | -0.335316 | ... | -0.506944 | -0.353104 | -0.357072 | 0.162981 | -0.589694 | 0.108572 | -0.317606 | -0.309060 | -1.286033 | 0.0 |
| 2 | 1.539577 | 0.336837 | -0.579563 | -0.253797 | -0.321494 | -0.333467 | -0.295119 | -0.148593 | -0.714719 | -0.335316 | ... | 0.759307 | -0.353104 | -0.140783 | 0.287297 | 0.987603 | -0.292667 | -0.317606 | -0.309060 | -1.286033 | 0.0 |
| 3 | -0.549300 | -0.033822 | -0.078297 | 0.386354 | 2.499496 | -0.333467 | -0.295119 | -0.148593 | -0.714719 | -0.335316 | ... | -0.490456 | 1.683022 | -0.322943 | 2.799367 | -0.694598 | 0.305518 | -0.002491 | 0.623742 | 0.777585 | 0.0 |
| 4 | -0.653744 | 0.165457 | 0.525639 | 1.105625 | -0.321494 | -0.333467 | -0.295119 | -0.148593 | 1.857289 | -0.335316 | ... | 0.252899 | -0.353104 | 0.838203 | 0.951118 | -0.514479 | 1.271098 | -0.317606 | -0.309060 | 0.777585 | 0.0 |
5 rows × 88 columns
pca = PCA(0.90)
X_train_pca=pca.fit_transform(X_train)
X_train_pca.shape
(20985, 40)
pca.components_
array([[ 0.07633853, 0.22566864, 0.13663787, ..., 0.03945671,
-0.03219828, 0. ],
[ 0.15513212, 0.09444172, -0.06123446, ..., -0.08169126,
-0.04890042, -0. ],
[-0.05271368, 0.17236309, 0.21582728, ..., 0.27568083,
0.00317266, -0. ],
...,
[ 0.20732827, -0.07185651, 0.00651994, ..., 0.09591715,
-0.14249913, 0. ],
[-0.08092157, 0.04718852, -0.02644475, ..., 0.1039233 ,
0.0244161 , 0. ],
[-0.0472879 , -0.0264921 , -0.04764544, ..., 0.07186826,
-0.32915986, 0. ]])
colnames = list(X_train.columns)
pcs_df = pd.DataFrame({'PC1':pca.components_[0],'PC2':pca.components_[1], 'Feature':colnames})
pcs_df.head(30)
| PC1 | PC2 | Feature | |
|---|---|---|---|
| 0 | 0.076339 | 0.155132 | total_rech_num_8 |
| 1 | 0.225669 | 0.094442 | total_rech_amt_8 |
| 2 | 0.136638 | -0.061234 | max_rech_amt_8 |
| 3 | 0.109444 | -0.053867 | last_day_rch_amt_8 |
| 4 | 0.014793 | -0.062349 | monthly_2g_8 |
| 5 | -0.002771 | 0.011097 | sachet_2g_8 |
| 6 | 0.056406 | -0.070434 | monthly_3g_8 |
| 7 | 0.006635 | 0.001019 | sachet_3g_8 |
| 8 | 0.049381 | -0.105510 | aon_years |
| 9 | -0.013258 | 0.003086 | sachet_2g_good_ph |
| 10 | 0.000050 | -0.005416 | sachet_3g_good_ph |
| 11 | 0.005280 | -0.074147 | monthly_2g_good_ph |
| 12 | 0.042446 | -0.077806 | monthly_3g_good_ph |
| 13 | 0.107142 | -0.091951 | max_rech_amt_good_ph |
| 14 | 0.042225 | 0.142088 | total_rech_num_good_ph |
| 15 | 0.203252 | 0.088775 | tot_rech_amt_good_ph |
| 16 | 0.092732 | -0.065286 | last_day_rch_amt_good_ph |
| 17 | 0.018674 | 0.083224 | day_of_last_rech_6 |
| 18 | 0.045441 | 0.083241 | day_of_last_rech_7 |
| 19 | 0.056599 | 0.089709 | day_of_last_rech_8 |
| 20 | 0.227566 | 0.094874 | arpu_8 |
| 21 | 0.114218 | 0.193629 | onnet_mou_8 |
| 22 | 0.176213 | 0.145429 | offnet_mou_8 |
| 23 | 0.014691 | 0.009111 | roam_ic_mou_8 |
| 24 | 0.020832 | 0.034103 | roam_og_mou_8 |
| 25 | 0.140692 | -0.015786 | loc_og_t2t_mou_8 |
| 26 | 0.200935 | -0.078544 | loc_og_t2m_mou_8 |
| 27 | 0.086528 | -0.082607 | loc_og_t2f_mou_8 |
| 28 | 0.018994 | 0.041399 | loc_og_t2c_mou_8 |
| 29 | 0.223691 | -0.064883 | loc_og_mou_8 |
corrmat = np.corrcoef(X_train_pca.transpose())
corrmat.shape
(40, 40)
# 1s -> 0s in diagonals
corrmat_nodiag = corrmat - np.diagflat(corrmat.diagonal())
print("max corr:",corrmat_nodiag.max(), ", min corr: ", corrmat_nodiag.min(),)
# we see that correlations are indeed very close to 0
max corr: 1.4080458222073042e-15 , min corr: -1.3545759810548892e-15
#Applying selected components to the test data - 16 components
X_test_pca = pca.transform(X_test)
X_test_pca.shape
(8994, 40)
#Training the model on the train data
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
learner_pca = LogisticRegression(class_weight='balanced')
model_pca = learner_pca.fit(X_train_pca,y_train)
#Making prediction on the train data
y_train_pred = model_pca.predict_proba(X_train_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_train, y_train_pred))
'0.89'
#Making prediction on the test data
y_pred = model_pca.predict_proba(X_test_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_test, y_pred))
'0.89'
y_pred=pd.Series(y_pred)
y_train_pred=pd.Series(y_train_pred)
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.5 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.5 else 0)
confusion_matrix(y_train,y_train_pred_bool)
array([[15923, 3250],
[ 313, 1499]], dtype=int64)
confusion_matrix(y_test,y_pred_bool)
array([[6856, 1361],
[ 139, 638]], dtype=int64)
precision_score(y_test,y_pred_bool)
0.31915957978989495
recall_score(y_test,y_pred_bool)
0.8211068211068211
accuracy_score(y_test,y_pred_bool)
0.8332221480987325
#Function to calculate Model Metrics
def get_model_metrics(actual,predicted):
"""
Print Summary Metrics of the Model
Parameters:
actual (pandas.core.series.Series): Series of Boolean values for target column
predicted (pandas.core.series.Series): Series of Boolean values for Model predicted the target column
Returns:
None
"""
confusion = metrics.confusion_matrix(actual,predicted )
TP = confusion[1,1] # true positive
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives
accuracy=metrics.accuracy_score(actual,predicted)
sensitivity = TP / float(TP+FN)
specificity = TN / float(TN+FP)
precision=TP/float(FP+TP)
recall=TP/float(FN+TP)
precision=metrics.precision_score(actual,predicted)
recall=metrics.recall_score(actual,predicted)
f1=metrics.f1_score(actual,predicted, average='weighted')
print("Accuracy : "+str(round(accuracy,2)))
print("Sensitivity : "+str(round(sensitivity,2)))
print("Specificity : "+str(round(specificity,2)))
print("Precision : "+str(round(precision,2)))
print("Recall : "+str(round(recall,2)))
print("F1_score : "+str(round(f1,2)))
def get_cross_validated_model_metrics(X,y,cv=5):
"""
Get cross validated model metric for k folds
Parameters:
X (pandas.core.frame.DataFrame): DF of all the features excluding target column
y (pandas.core.series.Series): Series of Boolean values of the target column
Returns:
df (pandas.core.frame.DataFrame): DF will all the metric for k fold
"""
accuracy=cross_val_score(logregcv, X, y, cv=cv,n_jobs=-1,scoring='accuracy')
precision=cross_val_score(logregcv, X, y, cv=cv,n_jobs=-1,scoring='precision')
recall=cross_val_score(logregcv, X, y, cv=cv,n_jobs=-1,scoring='recall')
f1_weighted=cross_val_score(logregcv, X, y, cv=cv,n_jobs=-1,scoring='f1_weighted')
roc_auc=cross_val_score(logregcv, X, y, cv=cv,n_jobs=-1,scoring='roc_auc')
df=pd.DataFrame(
{'accuracy': accuracy,
'precision': precision,
'recall': recall,
'f1_weighted': f1_weighted,
'roc_auc': roc_auc,
})
return df
get_model_metrics(y_test,y_pred_bool)
Accuracy : 0.83 Sensitivity : 0.82 Specificity : 0.83 Precision : 0.32 Recall : 0.82 F1_score : 0.86
pca = PCA(0.85)
X_train_pca=pca.fit_transform(X_train)
X_train_pca.shape
(20985, 33)
pca.components_
array([[ 0.07633853, 0.22566864, 0.13663787, ..., 0.03945671,
-0.03219828, 0. ],
[ 0.15513212, 0.09444172, -0.06123446, ..., -0.08169126,
-0.04890042, -0. ],
[-0.05271368, 0.17236309, 0.21582728, ..., 0.27568083,
0.00317266, -0. ],
...,
[ 0.08794035, 0.08126948, -0.063038 , ..., -0.01767814,
0.24643063, 0. ],
[-0.0551191 , -0.14849084, -0.10057557, ..., 0.11180516,
0.09907325, -0. ],
[-0.3403509 , -0.03490944, 0.04207936, ..., -0.01395101,
-0.02831194, 0. ]])
colnames = list(X_train.columns)
pcs_df = pd.DataFrame({'PC1':pca.components_[0],'PC2':pca.components_[1], 'Feature':colnames})
pcs_df.head()
| PC1 | PC2 | Feature | |
|---|---|---|---|
| 0 | 0.076339 | 0.155132 | total_rech_num_8 |
| 1 | 0.225669 | 0.094442 | total_rech_amt_8 |
| 2 | 0.136638 | -0.061234 | max_rech_amt_8 |
| 3 | 0.109444 | -0.053867 | last_day_rch_amt_8 |
| 4 | 0.014793 | -0.062349 | monthly_2g_8 |
corrmat = np.corrcoef(X_train_pca.transpose())
corrmat.shape
(33, 33)
# 1s -> 0s in diagonals
corrmat_nodiag = corrmat - np.diagflat(corrmat.diagonal())
print("max corr:",corrmat_nodiag.max(), ", min corr: ", corrmat_nodiag.min(),)
# we see that correlations are indeed very close to 0
max corr: 1.0844956478739332e-15 , min corr: -1.3545759810548892e-15
#Applying selected components to the test data - 16 components
X_test_pca = pca.transform(X_test)
X_test_pca.shape
(8994, 33)
#Training the model on the train data
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
learner_pca = LogisticRegression(class_weight='balanced')
model_pca = learner_pca.fit(X_train_pca,y_train)
#Making prediction on the train data
y_train_pred = model_pca.predict_proba(X_train_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_train, y_train_pred))
'0.89'
#Making prediction on the test data
y_pred = model_pca.predict_proba(X_test_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_test, y_pred))
'0.89'
y_pred=pd.Series(y_pred)
y_train_pred=pd.Series(y_train_pred)
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.5 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.5 else 0)
confusion_matrix(y_test,y_pred_bool)
array([[6805, 1412],
[ 142, 635]], dtype=int64)
confusion_matrix(y_train,y_train_pred_bool)
array([[15828, 3345],
[ 325, 1487]], dtype=int64)
import sklearn.metrics
precision_score(y_test,y_pred_bool)
0.3102100635075721
recall_score(y_test,y_pred_bool)
0.8172458172458172
accuracy_score(y_test,y_pred_bool)
0.8272181454302868
get_model_metrics(y_test,y_pred_bool)
Accuracy : 0.83 Sensitivity : 0.82 Specificity : 0.83 Precision : 0.31 Recall : 0.82 F1_score : 0.86
pca = PCA(0.80)
X_train_pca=pca.fit_transform(X_train)
X_train_pca.shape
(20985, 28)
pca.components_
array([[ 0.07633853, 0.22566864, 0.13663787, ..., 0.03945671,
-0.03219828, 0. ],
[ 0.15513212, 0.09444172, -0.06123446, ..., -0.08169126,
-0.04890042, -0. ],
[-0.05271368, 0.17236309, 0.21582728, ..., 0.27568083,
0.00317266, -0. ],
...,
[-0.04948918, 0.04932912, -0.04596317, ..., -0.04633158,
-0.04737415, -0. ],
[-0.08773346, -0.01372893, -0.01463961, ..., -0.00117623,
0.20125531, -0. ],
[-0.13545784, -0.04061064, -0.12696834, ..., 0.01968636,
0.06213658, 0. ]])
colnames = list(X_train.columns)
pcs_df = pd.DataFrame({'PC1':pca.components_[0],'PC2':pca.components_[1], 'Feature':colnames})
pcs_df.head()
| PC1 | PC2 | Feature | |
|---|---|---|---|
| 0 | 0.076339 | 0.155132 | total_rech_num_8 |
| 1 | 0.225669 | 0.094442 | total_rech_amt_8 |
| 2 | 0.136638 | -0.061234 | max_rech_amt_8 |
| 3 | 0.109444 | -0.053867 | last_day_rch_amt_8 |
| 4 | 0.014793 | -0.062349 | monthly_2g_8 |
corrmat = np.corrcoef(X_train_pca.transpose())
corrmat.shape
(28, 28)
# 1s -> 0s in diagonals
corrmat_nodiag = corrmat - np.diagflat(corrmat.diagonal())
print("max corr:",corrmat_nodiag.max(), ", min corr: ", corrmat_nodiag.min(),)
# we see that correlations are indeed very close to 0
max corr: 1.0844956478739332e-15 , min corr: -1.5184103551529068e-15
#Applying selected components to the test data - 16 components
X_test_pca = pca.transform(X_test)
X_test_pca.shape
(8994, 28)
#Training the model on the train data
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
learner_pca = LogisticRegression(class_weight='balanced')
model_pca = learner_pca.fit(X_train_pca,y_train)
#Making prediction on the train data
y_train_pred = model_pca.predict_proba(X_train_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_train, y_train_pred))
'0.88'
#Making prediction on the test data
y_pred = model_pca.predict_proba(X_test_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_test, y_pred))
'0.88'
y_pred=pd.Series(y_pred)
y_train_pred=pd.Series(y_train_pred)
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.5 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.5 else 0)
confusion_matrix(y_test,y_pred_bool)
array([[6843, 1374],
[ 144, 633]], dtype=int64)
confusion_matrix(y_train,y_train_pred_bool)
array([[15872, 3301],
[ 347, 1465]], dtype=int64)
import sklearn.metrics
precision_score(y_test,y_pred_bool)
0.3153961136023916
recall_score(y_test,y_pred_bool)
0.8146718146718147
accuracy_score(y_test,y_pred_bool)
0.8312208138759173
get_model_metrics(y_test,y_pred_bool)
Accuracy : 0.83 Sensitivity : 0.81 Specificity : 0.83 Precision : 0.32 Recall : 0.81 F1_score : 0.86
X_train.shape
(20985, 88)
#Improting the PCA module
from sklearn.decomposition import PCA
pca = PCA(svd_solver='randomized', random_state=42)
X_train_pca=pca.fit_transform(X_train)
X_train_pca.shape
(20985, 88)
pca.components_
array([[ 7.63385296e-02, 2.25668637e-01, 1.36637874e-01, ...,
3.94567053e-02, -3.21982809e-02, 0.00000000e+00],
[ 1.55132118e-01, 9.44417182e-02, -6.12344615e-02, ...,
-8.16912579e-02, -4.89004224e-02, 0.00000000e+00],
[-5.27136831e-02, 1.72363091e-01, 2.15827278e-01, ...,
2.75680826e-01, 3.17265679e-03, -0.00000000e+00],
...,
[-5.38011643e-08, 1.69099296e-07, -1.18763814e-07, ...,
1.10214967e-08, 1.16673514e-07, 0.00000000e+00],
[-7.49304793e-08, 1.20074454e-07, -3.78340539e-08, ...,
-4.04387345e-08, 4.89967732e-10, 0.00000000e+00],
[ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
0.00000000e+00, 0.00000000e+00, 1.00000000e+00]])
colnames = list(X_train.columns)
pcs_df = pd.DataFrame({'PC1':pca.components_[0],'PC2':pca.components_[1], 'Feature':colnames})
pcs_df.head(30)
| PC1 | PC2 | Feature | |
|---|---|---|---|
| 0 | 0.076339 | 0.155132 | total_rech_num_8 |
| 1 | 0.225669 | 0.094442 | total_rech_amt_8 |
| 2 | 0.136638 | -0.061234 | max_rech_amt_8 |
| 3 | 0.109444 | -0.053867 | last_day_rch_amt_8 |
| 4 | 0.014793 | -0.062349 | monthly_2g_8 |
| 5 | -0.002771 | 0.011097 | sachet_2g_8 |
| 6 | 0.056406 | -0.070434 | monthly_3g_8 |
| 7 | 0.006635 | 0.001019 | sachet_3g_8 |
| 8 | 0.049381 | -0.105510 | aon_years |
| 9 | -0.013258 | 0.003086 | sachet_2g_good_ph |
| 10 | 0.000050 | -0.005416 | sachet_3g_good_ph |
| 11 | 0.005280 | -0.074147 | monthly_2g_good_ph |
| 12 | 0.042446 | -0.077806 | monthly_3g_good_ph |
| 13 | 0.107142 | -0.091951 | max_rech_amt_good_ph |
| 14 | 0.042225 | 0.142088 | total_rech_num_good_ph |
| 15 | 0.203252 | 0.088775 | tot_rech_amt_good_ph |
| 16 | 0.092732 | -0.065286 | last_day_rch_amt_good_ph |
| 17 | 0.018674 | 0.083224 | day_of_last_rech_6 |
| 18 | 0.045441 | 0.083241 | day_of_last_rech_7 |
| 19 | 0.056599 | 0.089709 | day_of_last_rech_8 |
| 20 | 0.227566 | 0.094874 | arpu_8 |
| 21 | 0.114218 | 0.193629 | onnet_mou_8 |
| 22 | 0.176213 | 0.145429 | offnet_mou_8 |
| 23 | 0.014691 | 0.009111 | roam_ic_mou_8 |
| 24 | 0.020832 | 0.034103 | roam_og_mou_8 |
| 25 | 0.140692 | -0.015786 | loc_og_t2t_mou_8 |
| 26 | 0.200935 | -0.078544 | loc_og_t2m_mou_8 |
| 27 | 0.086528 | -0.082607 | loc_og_t2f_mou_8 |
| 28 | 0.018994 | 0.041399 | loc_og_t2c_mou_8 |
| 29 | 0.223691 | -0.064883 | loc_og_mou_8 |
pca.explained_variance_ratio_
array([1.05523190e-01, 9.35620165e-02, 6.07000296e-02, 4.87794462e-02,
4.36956587e-02, 3.84396347e-02, 3.10280137e-02, 2.91858432e-02,
2.81325159e-02, 2.74104374e-02, 2.63263170e-02, 2.52636769e-02,
2.42871750e-02, 2.30059535e-02, 2.13248453e-02, 2.02745458e-02,
1.88465408e-02, 1.67142391e-02, 1.47504540e-02, 1.43684902e-02,
1.39366438e-02, 1.34973248e-02, 1.30978332e-02, 1.24556228e-02,
1.20125318e-02, 1.07138470e-02, 1.00918448e-02, 9.86099508e-03,
9.73066836e-03, 9.44571953e-03, 9.28097814e-03, 8.58852311e-03,
8.52420458e-03, 8.08950927e-03, 7.99544103e-03, 7.68923889e-03,
7.36964213e-03, 7.23625935e-03, 6.87277474e-03, 6.81889637e-03,
6.64771908e-03, 6.18745755e-03, 5.77083420e-03, 5.35067544e-03,
5.10346473e-03, 4.96276039e-03, 4.55836387e-03, 3.74719120e-03,
3.65611275e-03, 3.49448213e-03, 3.39702242e-03, 3.28286973e-03,
3.14252201e-03, 3.05116189e-03, 3.00821935e-03, 2.74749351e-03,
2.67793486e-03, 2.63332878e-03, 2.51357162e-03, 2.43411028e-03,
2.39974934e-03, 2.29246439e-03, 2.14026916e-03, 2.09463542e-03,
2.07176829e-03, 1.67123315e-03, 1.57515410e-03, 8.73261202e-04,
8.16787618e-04, 4.00894160e-04, 2.62314395e-04, 7.50628962e-05,
2.69947183e-05, 3.68946684e-06, 9.03722548e-07, 1.16342153e-11,
5.22446200e-12, 4.11473552e-12, 2.11395763e-12, 1.73379951e-12,
1.54089522e-12, 9.75475645e-13, 8.98816972e-13, 7.05871034e-13,
4.17916770e-13, 2.84710775e-13, 1.80504538e-13, 4.17814700e-34])
#Making the screeplot - plotting the cumulative variance against the number of components
%matplotlib inline
fig = plt.figure(figsize = (12,8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()
from sklearn.decomposition import IncrementalPCA
pca = IncrementalPCA(n_components=40)
X_train_pca=pca.fit_transform(X_train)
X_train_pca.shape
(20985, 40)
pca.components_
array([[ 0.07629758, 0.2256092 , 0.13664919, ..., 0.03949844,
-0.03227046, -0. ],
[ 0.15517044, 0.09443407, -0.06122161, ..., -0.08173608,
-0.04885412, 0. ],
[-0.05255816, 0.17220162, 0.21603692, ..., 0.27583064,
0.00318245, -0. ],
...,
[-0.01347025, -0.01290335, 0.04479986, ..., 0.02463924,
0.08937706, 0. ],
[ 0.08466575, -0.05716234, 0.01075553, ..., -0.24198979,
-0.15457304, 0. ],
[-0.078044 , -0.01481756, 0.07862538, ..., 0.28532987,
-0.19249444, 0. ]])
colnames = list(X_train.columns)
pcs_df = pd.DataFrame({'PC1':pca.components_[0],'PC2':pca.components_[1], 'Feature':colnames})
pcs_df.head(30)
| PC1 | PC2 | Feature | |
|---|---|---|---|
| 0 | 0.076298 | 0.155170 | total_rech_num_8 |
| 1 | 0.225609 | 0.094434 | total_rech_amt_8 |
| 2 | 0.136649 | -0.061222 | max_rech_amt_8 |
| 3 | 0.109520 | -0.053851 | last_day_rch_amt_8 |
| 4 | 0.014749 | -0.062273 | monthly_2g_8 |
| 5 | -0.002801 | 0.011119 | sachet_2g_8 |
| 6 | 0.056479 | -0.070432 | monthly_3g_8 |
| 7 | 0.006599 | 0.001022 | sachet_3g_8 |
| 8 | 0.049391 | -0.105521 | aon_years |
| 9 | -0.013189 | 0.003058 | sachet_2g_good_ph |
| 10 | 0.000099 | -0.005414 | sachet_3g_good_ph |
| 11 | 0.005376 | -0.074215 | monthly_2g_good_ph |
| 12 | 0.042528 | -0.077735 | monthly_3g_good_ph |
| 13 | 0.107139 | -0.091909 | max_rech_amt_good_ph |
| 14 | 0.042244 | 0.142112 | total_rech_num_good_ph |
| 15 | 0.203216 | 0.088813 | tot_rech_amt_good_ph |
| 16 | 0.092682 | -0.065364 | last_day_rch_amt_good_ph |
| 17 | 0.018665 | 0.083224 | day_of_last_rech_6 |
| 18 | 0.045425 | 0.083261 | day_of_last_rech_7 |
| 19 | 0.056687 | 0.089654 | day_of_last_rech_8 |
| 20 | 0.227478 | 0.094871 | arpu_8 |
| 21 | 0.114232 | 0.193582 | onnet_mou_8 |
| 22 | 0.176185 | 0.145456 | offnet_mou_8 |
| 23 | 0.014779 | 0.009159 | roam_ic_mou_8 |
| 24 | 0.020704 | 0.034018 | roam_og_mou_8 |
| 25 | 0.140618 | -0.015793 | loc_og_t2t_mou_8 |
| 26 | 0.200931 | -0.078521 | loc_og_t2m_mou_8 |
| 27 | 0.086519 | -0.082582 | loc_og_t2f_mou_8 |
| 28 | 0.019088 | 0.041479 | loc_og_t2c_mou_8 |
| 29 | 0.223641 | -0.064872 | loc_og_mou_8 |
%matplotlib inline
fig = plt.figure(figsize = (8,8))
plt.scatter(pcs_df.PC1, pcs_df.PC2)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
for i, txt in enumerate(pcs_df.Feature):
plt.annotate(txt, (pcs_df.PC1[i],pcs_df.PC2[i]))
plt.tight_layout()
plt.show()
pca.explained_variance_ratio_
array([0.10552123, 0.09356073, 0.06069537, 0.0487609 , 0.04369076,
0.03843441, 0.03098464, 0.02915601, 0.02809563, 0.0273662 ,
0.02627115, 0.02524961, 0.02425371, 0.02290334, 0.02125699,
0.02026422, 0.01883319, 0.01656114, 0.01469295, 0.01428665,
0.01381256, 0.01341614, 0.01264494, 0.0122565 , 0.0119078 ,
0.01067961, 0.0100218 , 0.00979932, 0.00938025, 0.00924034,
0.00855022, 0.00827311, 0.00802292, 0.0074819 , 0.0073239 ,
0.00710277, 0.00677336, 0.00647 , 0.00622158, 0.0057558 ])
%matplotlib inline
fig = plt.figure(figsize = (8,8))
plt.scatter(X_train_pca[:,0], X_train_pca[:,1], c = y_train.map({0:'green',1:'red'}))
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.tight_layout()
plt.show()
X_train.head()
| total_rech_num_8 | total_rech_amt_8 | max_rech_amt_8 | last_day_rch_amt_8 | monthly_2g_8 | sachet_2g_8 | monthly_3g_8 | sachet_3g_8 | aon_years | sachet_2g_good_ph | ... | total_og_mou_good_ph | vbc_3g_good_ph | std_ic_mou_good_ph | loc_ic_mou_good_ph | std_og_mou_good_ph | loc_og_mou_good_ph | vol_2g_mb_good_ph | vol_3g_mb_good_ph | action_required | high_value_customer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.446222 | -0.120083 | -0.187419 | -0.645018 | -0.318315 | -0.327589 | -0.29811 | -0.142157 | 0.382464 | -0.322029 | ... | 1.898281 | -0.349485 | -0.050728 | -0.563974 | 2.299885 | -0.490779 | -0.302375 | 0.328327 | 0.784442 | 0.0 |
| 1 | -0.234334 | 0.410158 | -0.050759 | 0.368976 | 2.470764 | -0.327589 | -0.29811 | -0.142157 | -0.354272 | -0.322029 | ... | -0.650806 | -0.349485 | 0.119928 | -0.661131 | -0.294499 | -0.634671 | 0.963922 | -0.352741 | 0.784442 | 0.0 |
| 2 | 1.142938 | 0.295717 | -0.187419 | -0.489539 | -0.318315 | -0.327589 | -0.29811 | -0.142157 | 0.014096 | -0.322029 | ... | -0.997202 | -0.145330 | -0.267088 | -0.367658 | -0.701396 | -0.565689 | -0.305701 | -0.353686 | -1.274792 | 0.0 |
| 3 | -0.446222 | 0.078281 | -0.187419 | 0.233777 | -0.318315 | -0.327589 | -0.29811 | -0.142157 | 1.119200 | -0.322029 | ... | -0.431579 | -0.349485 | -0.215869 | -0.387189 | -0.521795 | 0.114703 | -0.305701 | -0.353686 | 0.784442 | 0.0 |
| 4 | 0.083498 | -0.007550 | 0.034653 | -0.645018 | -0.318315 | -0.327589 | -0.29811 | -0.142157 | -0.722640 | -0.322029 | ... | 0.052038 | -0.349485 | -0.381728 | -0.731184 | 0.505773 | -0.699722 | -0.305701 | -0.353686 | -1.274792 | 0.0 |
5 rows × 88 columns
corrmat = np.corrcoef(X_train_pca.transpose())
corrmat.shape
(40, 40)
#plotting the correlation matrix
%matplotlib inline
plt.figure(figsize = (20,10))
sns.heatmap(corrmat[0:19,0:19],annot = True)
<Axes: >
%matplotlib inline
plt.figure(figsize = (20,10))
sns.heatmap(corrmat[19:40,19:40],annot = True)
<Axes: >
# 1s -> 0s in diagonals
corrmat_nodiag = corrmat - np.diagflat(corrmat.diagonal())
print("max corr:",corrmat_nodiag.max(), ", min corr: ", corrmat_nodiag.min(),)
# we see that correlations are indeed very close to 0
max corr: 0.13631017903169826 , min corr: -0.05370040292263015
#Applying selected components to the test data - 16 components
X_test_pca = pca.transform(X_test)
X_test_pca.shape
(8994, 40)
#Training the model on the train data
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics
log_reg_cv = LogisticRegressionCV(class_weight='balanced',cv=20)
model_pca = log_reg_cv.fit(X_train_pca,y_train)
#Making prediction on the train data
y_train_pred = model_pca.predict_proba(X_train_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_train, y_train_pred))
'0.89'
#Making prediction on the test data
y_pred = model_pca.predict_proba(X_test_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_test, y_pred))
'0.89'
from sklearn.metrics import precision_score,recall_score,accuracy_score,confusion_matrix
y_pred=pd.Series(y_pred)
y_train_pred=pd.Series(y_train_pred)
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.5 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.5 else 0)
confusion_matrix(y_test,y_pred_bool)
array([[6856, 1361],
[ 142, 635]], dtype=int64)
confusion_matrix(y_train,y_train_pred_bool)
array([[15924, 3249],
[ 322, 1490]], dtype=int64)
precision_score(y_test,y_pred_bool)
0.3181362725450902
recall_score(y_test,y_pred_bool)
0.8172458172458172
accuracy_score(y_test,y_pred_bool)
0.83288859239493
get_model_metrics(y_test,y_pred_bool)
Accuracy : 0.83 Sensitivity : 0.82 Specificity : 0.83 Precision : 0.32 Recall : 0.82 F1_score : 0.86
#Training the model on the train data
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
learner_pca = LogisticRegression(class_weight='balanced')
model_pca = learner_pca.fit(X_train_pca,y_train)
#Making prediction on the train data
y_train_pred = model_pca.predict_proba(X_train_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_train, y_train_pred))
'0.89'
#Making prediction on the test data
y_pred = model_pca.predict_proba(X_test_pca)[:,1]
"{:2.2}".format(metrics.roc_auc_score(y_test, y_pred))
'0.89'
from sklearn.metrics import precision_score,recall_score,accuracy_score,confusion_matrix
y_pred=pd.Series(y_pred)
y_train_pred=pd.Series(y_train_pred)
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.5 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.5 else 0)
confusion_matrix(y_test,y_pred_bool)
array([[6851, 1366],
[ 138, 639]], dtype=int64)
confusion_matrix(y_train,y_train_pred_bool)
array([[15883, 3290],
[ 311, 1501]], dtype=int64)
import sklearn.metrics
precision_score(y_test,y_pred_bool)
0.31870324189526184
recall_score(y_test,y_pred_bool)
0.8223938223938224
accuracy_score(y_test,y_pred_bool)
0.8327774071603291
def draw_roc( actual, probs ):
fpr, tpr, thresholds = metrics.roc_curve( actual, probs,
drop_intermediate = False )
auc_score = metrics.roc_auc_score( actual, probs )
plt.figure(figsize=(5, 5))
plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
return None
fpr, tpr, thresholds = metrics.roc_curve( y_train, y_train_pred, drop_intermediate = False )
draw_roc(y_train, y_train_pred)
# Now let's calculate accuracy sensitivity and specificity for various probability cutoffs.
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])
from sklearn.metrics import confusion_matrix
confusion=confusion_matrix(y_train,y_train_pred_bool)
# TP = confusion[1,1] # true positive
# TN = confusion[0,0] # true negatives
# FP = confusion[0,1] # false positives
# FN = confusion[1,0] # false negatives
def threshold_optimization(y,num):
#pass the prediction_probability series and threshold probabilty cutoff
y_bool= y.apply(lambda x: 1 if x>=num else 0)
return y_bool
num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in num:
cm1 = metrics.confusion_matrix(y_train,threshold_optimization(y_train_pred,i))
total1=sum(sum(cm1))
accuracy = (cm1[0,0]+cm1[1,1])/total1
speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
cutoff_df.loc[i] =[ i ,accuracy,sensi,speci]
print(cutoff_df)
prob accuracy sensi speci 0.0 0.0 0.086347 1.000000 0.000000 0.1 0.1 0.371265 0.979029 0.313827 0.2 0.2 0.542578 0.953642 0.503729 0.3 0.3 0.665142 0.915011 0.641527 0.4 0.4 0.761449 0.871965 0.751004 0.5 0.5 0.828401 0.828366 0.828405 0.6 0.6 0.871432 0.764901 0.881500 0.7 0.7 0.899309 0.682119 0.919835 0.8 0.8 0.912890 0.498896 0.952016 0.9 0.9 0.919228 0.286976 0.978981
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.3 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.3 else 0)
cm_test=confusion_matrix(y_test,y_pred_bool)
cm_train=confusion_matrix(y_train,y_train_pred_bool)
cm_train
array([[12300, 6873],
[ 154, 1658]], dtype=int64)
cm_test
array([[5306, 2911],
[ 63, 714]], dtype=int64)
get_model_metrics(y_test,y_pred_bool)
Accuracy : 0.67 Sensitivity : 0.92 Specificity : 0.65 Precision : 0.2 Recall : 0.92 F1_score : 0.74
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.4 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.4 else 0)
get_model_metrics(y_test,y_pred_bool)
Accuracy : 0.77 Sensitivity : 0.87 Specificity : 0.76 Precision : 0.25 Recall : 0.87 F1_score : 0.81
y_pred_bool= y_pred.apply(lambda x: 1 if x>=0.5 else 0)
y_train_pred_bool= y_train_pred.apply(lambda x: 1 if x>=0.5 else 0)
get_model_metrics(y_test,y_pred_bool)
Accuracy : 0.83 Sensitivity : 0.82 Specificity : 0.83 Precision : 0.32 Recall : 0.82 F1_score : 0.86
cross_validated_train_metrics=get_cross_validated_model_metrics(X_train_pca,y_train,cv=5)
cross_validated_train_metrics
| accuracy | precision | recall | f1_weighted | roc_auc | |
|---|---|---|---|---|---|
| 0 | 0.832261 | 0.317256 | 0.815427 | 0.862413 | 0.887782 |
| 1 | 0.822254 | 0.308308 | 0.848485 | 0.855716 | 0.892362 |
| 2 | 0.830593 | 0.314952 | 0.820442 | 0.861362 | 0.885959 |
| 3 | 0.820110 | 0.301715 | 0.825967 | 0.853889 | 0.887532 |
| 4 | 0.834882 | 0.318332 | 0.801105 | 0.864122 | 0.878673 |
cross_validated_train_metrics.mean()
accuracy 0.828020 precision 0.312113 recall 0.822285 f1_weighted 0.859500 roc_auc 0.886461 dtype: float64
cross_validated_test_metrics=get_cross_validated_model_metrics(X_test_pca,y_test,cv=5)
cross_validated_test_metrics
cross_validated_test_metrics.mean()